diff mbox series

[12/17] chunk-format: create read chunk API

Message ID e3475633e1d2e397c5095f34c23a19fccb2c8428.1611676886.git.gitgitgadget@gmail.com (mailing list archive)
State Superseded
Headers show
Series Refactor chunk-format into an API | expand

Commit Message

Derrick Stolee Jan. 26, 2021, 4:01 p.m. UTC
From: Derrick Stolee <dstolee@microsoft.com>

Add the capability to read the table of contents, then pair the chunks
with necessary logic using read_chunk_fn pointers. Callers will be added
in future changes, but the typical outline will be:

 1. initialize a 'struct chunkfile' with init_chunkfile(NULL).
 2. call read_table_of_contents().
 3. for each chunk to parse, call pair_chunk() with appropriate pointers.
 4. call free_chunkfile() to clear the 'struct chunkfile' data.

We are re-using the anonymous 'struct chunkfile' data, as it is internal
to the chunk-format API. This gives it essentially two modes: write and
read. If the same struct instance was used for both reads and writes,
then there would be failures.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 chunk-format.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++
 chunk-format.h | 21 +++++++++++++++++
 2 files changed, 85 insertions(+)

Comments

Taylor Blau Jan. 27, 2021, 3:02 a.m. UTC | #1
On Tue, Jan 26, 2021 at 04:01:21PM +0000, Derrick Stolee via GitGitGadget wrote:
> diff --git a/chunk-format.c b/chunk-format.c
> index 2ce37ecc6bb..674d31d5e58 100644
> --- a/chunk-format.c
> +++ b/chunk-format.c
> @@ -12,6 +12,8 @@ struct chunk_info {
>  	uint32_t id;
>  	uint64_t size;
>  	chunk_write_fn write_fn;
> +
> +	const void *start;

It may be clearer to fold both of these into an anonymous union along
with an enum to indicate which mode we're in. But, I could also buy that
that is more error prone, so perhaps just a comment along the lines of
"exactly one of these is NULL" would suffice, too.

>  };
>
>  struct chunkfile {
> @@ -89,3 +91,65 @@ int write_chunkfile(struct chunkfile *cf, void *data)
>
>  	return 0;
>  }
> +
> +int read_table_of_contents(struct chunkfile *cf,
> +			   const unsigned char *mfile,
> +			   size_t mfile_size,

Assuming that mfile and mfile_size are a pointer to a memory mapped
region and its size? If so, a nit is that I'd expect "data" and "size"
instead of "mfile".

I think that it's probably going too far to have the chunkfile API
handle mapping its own memory, so in that way I don't think it's wrong
for the callers to be handling that.

OTOH, it does seem a little weird to temporarily hand off ownership like
this. I don't think I have a better suggestion, though.

The implementation of this function looks good to me.

> +int pair_chunk(struct chunkfile *cf,
> +	       uint32_t chunk_id,
> +	       chunk_read_fn fn,
> +	       void *data)
> +{
> +	int i;
> +
> +	for (i = 0; i < cf->chunks_nr; i++) {
> +		if (cf->chunks[i].id == chunk_id)
> +			return fn(cf->chunks[i].start, cf->chunks[i].size, data);
> +	}
> +
> +	return CHUNK_NOT_FOUND;
> +}
> diff --git a/chunk-format.h b/chunk-format.h
> index bfaed672813..250e08b8e6a 100644
> --- a/chunk-format.h
> +++ b/chunk-format.h
> @@ -17,4 +17,25 @@ void add_chunk(struct chunkfile *cf,
>  	       size_t size);
>  int write_chunkfile(struct chunkfile *cf, void *data);
>
> +int read_table_of_contents(struct chunkfile *cf,
> +			   const unsigned char *mfile,
> +			   size_t mfile_size,
> +			   uint64_t toc_offset,
> +			   int toc_length);
> +
> +/*
> + * When reading a table of contents, we find the chunk with matching 'id'
> + * then call its read_fn to populate the necessary 'data' based on the
> + * chunk start and size.
> + */
> +typedef int (*chunk_read_fn)(const unsigned char *chunk_start,
> +			     size_t chunk_size, void *data);
> +
> +
> +#define CHUNK_NOT_FOUND (-2)
> +int pair_chunk(struct chunkfile *cf,
> +	       uint32_t chunk_id,
> +	       chunk_read_fn fn,
> +	       void *data);

From reading the implementation, I take it that this function calls fn
with the location and size of the requested chunk, along with the user
supplied data.

I'm not sure that "pair" gives me that same sense. Maybe "read" or
"lookup" would be better?

Dunno.

Thanks,
Taylor
diff mbox series

Patch

diff --git a/chunk-format.c b/chunk-format.c
index 2ce37ecc6bb..674d31d5e58 100644
--- a/chunk-format.c
+++ b/chunk-format.c
@@ -12,6 +12,8 @@  struct chunk_info {
 	uint32_t id;
 	uint64_t size;
 	chunk_write_fn write_fn;
+
+	const void *start;
 };
 
 struct chunkfile {
@@ -89,3 +91,65 @@  int write_chunkfile(struct chunkfile *cf, void *data)
 
 	return 0;
 }
+
+int read_table_of_contents(struct chunkfile *cf,
+			   const unsigned char *mfile,
+			   size_t mfile_size,
+			   uint64_t toc_offset,
+			   int toc_length)
+{
+	uint32_t chunk_id;
+	const unsigned char *table_of_contents = mfile + toc_offset;
+
+	ALLOC_GROW(cf->chunks, toc_length, cf->chunks_alloc);
+
+	while (toc_length--) {
+		uint64_t chunk_offset, next_chunk_offset;
+
+		chunk_id = get_be32(table_of_contents);
+		chunk_offset = get_be64(table_of_contents + 4);
+
+		if (!chunk_id) {
+			error(_("terminating chunk id appears earlier than expected"));
+			return 1;
+		}
+
+		table_of_contents += CHUNK_LOOKUP_WIDTH;
+		next_chunk_offset = get_be64(table_of_contents + 4);
+
+		if (next_chunk_offset < chunk_offset ||
+		    next_chunk_offset > mfile_size - the_hash_algo->rawsz) {
+			error(_("improper chunk offset(s) %"PRIx64" and %"PRIx64""),
+			      chunk_offset, next_chunk_offset);
+			return -1;
+		}
+
+		cf->chunks[cf->chunks_nr].id = chunk_id;
+		cf->chunks[cf->chunks_nr].start = mfile + chunk_offset;
+		cf->chunks[cf->chunks_nr].size = next_chunk_offset - chunk_offset;
+		cf->chunks_nr++;
+	}
+
+	chunk_id = get_be32(table_of_contents);
+	if (chunk_id) {
+		error(_("final chunk has non-zero id %"PRIx32""), chunk_id);
+		return -1;
+	}
+
+	return 0;
+}
+
+int pair_chunk(struct chunkfile *cf,
+	       uint32_t chunk_id,
+	       chunk_read_fn fn,
+	       void *data)
+{
+	int i;
+
+	for (i = 0; i < cf->chunks_nr; i++) {
+		if (cf->chunks[i].id == chunk_id)
+			return fn(cf->chunks[i].start, cf->chunks[i].size, data);
+	}
+
+	return CHUNK_NOT_FOUND;
+}
diff --git a/chunk-format.h b/chunk-format.h
index bfaed672813..250e08b8e6a 100644
--- a/chunk-format.h
+++ b/chunk-format.h
@@ -17,4 +17,25 @@  void add_chunk(struct chunkfile *cf,
 	       size_t size);
 int write_chunkfile(struct chunkfile *cf, void *data);
 
+int read_table_of_contents(struct chunkfile *cf,
+			   const unsigned char *mfile,
+			   size_t mfile_size,
+			   uint64_t toc_offset,
+			   int toc_length);
+
+/*
+ * When reading a table of contents, we find the chunk with matching 'id'
+ * then call its read_fn to populate the necessary 'data' based on the
+ * chunk start and size.
+ */
+typedef int (*chunk_read_fn)(const unsigned char *chunk_start,
+			     size_t chunk_size, void *data);
+
+
+#define CHUNK_NOT_FOUND (-2)
+int pair_chunk(struct chunkfile *cf,
+	       uint32_t chunk_id,
+	       chunk_read_fn fn,
+	       void *data);
+
 #endif