[v4,12/17] chunk-format: create read chunk API

Message ID	3e0dbc45ce7ff5f1bda63ae8b8b45343b790417b.1613657259.git.gitgitgadget@gmail.com (mailing list archive)
State	Accepted
Commit	5f0879f54b1f5cda348528a49af57a9c3fd620f9
Headers	show Return-Path: <git-owner@kernel.org> Message-Id: <3e0dbc45ce7ff5f1bda63ae8b8b45343b790417b.1613657259.git.gitgitgadget@gmail.com> In-Reply-To: <pull.848.v4.git.1613657259.gitgitgadget@gmail.com> References: <pull.848.v3.git.1612535452.gitgitgadget@gmail.com> <pull.848.v4.git.1613657259.gitgitgadget@gmail.com> Date: Thu, 18 Feb 2021 14:07:34 +0000 Subject: [PATCH v4 12/17] chunk-format: create read chunk API Fcc: Sent Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIME-Version: 1.0 To: git@vger.kernel.org Cc: me@ttaylorr.com, gitster@pobox.com, l.s.r@web.de, szeder.dev@gmail.com, Chris Torek <chris.torek@gmail.com>, Derrick Stolee <stolee@gmail.com>, Derrick Stolee <derrickstolee@github.com>, Derrick Stolee <dstolee@microsoft.com> Precedence: bulk From: Derrick Stolee <dstolee@microsoft.com>
Series	Refactor chunk-format into an API \| expand [v4,00/17] Refactor chunk-format into an API [v4,01/17] commit-graph: anonymize data in chunk_write_fn [v4,02/17] chunk-format: create chunk format write API [v4,03/17] commit-graph: use chunk-format write API [v4,04/17] midx: rename pack_info to write_midx_context [v4,05/17] midx: use context in write_midx_pack_names() [v4,06/17] midx: add entries to write_midx_context [v4,07/17] midx: add pack_perm to write_midx_context [v4,08/17] midx: add num_large_offsets to write_midx_context [v4,09/17] midx: return success/failure in chunk write methods [v4,10/17] midx: drop chunk progress during write [v4,11/17] midx: use chunk-format API in write_midx_internal() [v4,12/17] chunk-format: create read chunk API [v4,13/17] commit-graph: use chunk-format read API [v4,14/17] midx: use chunk-format read API [v4,15/17] midx: use 64-bit multiplication for chunk sizes [v4,16/17] chunk-format: restore duplicate chunk checks [v4,17/17] chunk-format: add technical docs

Message ID

3e0dbc45ce7ff5f1bda63ae8b8b45343b790417b.1613657259.git.gitgitgadget@gmail.com (mailing list archive)

State

Accepted

Commit

5f0879f54b1f5cda348528a49af57a9c3fd620f9

Headers

Message-Id: 
 <3e0dbc45ce7ff5f1bda63ae8b8b45343b790417b.1613657259.git.gitgitgadget@gmail.com>
In-Reply-To: <pull.848.v4.git.1613657259.gitgitgadget@gmail.com>
References: <pull.848.v3.git.1612535452.gitgitgadget@gmail.com>
        <pull.848.v4.git.1613657259.gitgitgadget@gmail.com>
Date: Thu, 18 Feb 2021 14:07:34 +0000
Subject: [PATCH v4 12/17] chunk-format: create read chunk API
Fcc: Sent
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
MIME-Version: 1.0
To: git@vger.kernel.org
Cc: me@ttaylorr.com, gitster@pobox.com, l.s.r@web.de,
        szeder.dev@gmail.com, Chris Torek <chris.torek@gmail.com>,
        Derrick Stolee <stolee@gmail.com>,
        Derrick Stolee <derrickstolee@github.com>,
        Derrick Stolee <dstolee@microsoft.com>
Precedence: bulk
From: Derrick Stolee <dstolee@microsoft.com>

Series

Refactor chunk-format into an API | expand

Commit Message

Derrick Stolee Feb. 18, 2021, 2:07 p.m. UTC

From: Derrick Stolee <dstolee@microsoft.com>

Add the capability to read the table of contents, then pair the chunks
with necessary logic using read_chunk_fn pointers. Callers will be added
in future changes, but the typical outline will be:

 1. initialize a 'struct chunkfile' with init_chunkfile(NULL).
 2. call read_table_of_contents().
 3. for each chunk to parse,
    a. call pair_chunk() to assign a pointer with the chunk position, or
    b. call read_chunk() to run a callback on the chunk start and size.
 4. call free_chunkfile() to clear the 'struct chunkfile' data.

We are re-using the opaque 'struct chunkfile' type, as its definition is
internal to the chunk-format API. This gives it essentially two modes:
write and read. If the same struct instance were used for both reads and
writes, then there would be failures.

Helped-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 chunk-format.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++
 chunk-format.h | 47 +++++++++++++++++++++++++++++
 2 files changed, 127 insertions(+)

diff --git a/chunk-format.c b/chunk-format.c
index 6c9b52b70c10..2c1fecf1c3e5 100644
--- a/chunk-format.c
+++ b/chunk-format.c
@@ -11,6 +11,8 @@  struct chunk_info {
 	uint32_t id;
 	uint64_t size;
 	chunk_write_fn write_fn;
+
+	const void *start;
 };
 
 struct chunkfile {
@@ -88,3 +90,81 @@  int write_chunkfile(struct chunkfile *cf, void *data)
 
 	return 0;
 }
+
+/*
+ * Parse the table of contents of a chunk-format file and record each
+ * chunk's id, start pointer and size in 'cf'.
+ *
+ * The table starts at 'mfile + toc_offset' and holds 'toc_length' entries
+ * of CHUNK_TOC_ENTRY_SIZE bytes each (an id read with get_be32() followed
+ * by an offset read with get_be64()), plus one terminating entry whose id
+ * must be zero. A chunk's size is the distance from its own offset to the
+ * offset stored in the following entry.
+ *
+ * 'toc_length' must not be negative, and the caller must make sure that
+ * the whole table lies inside the 'mfile_size' bytes at 'mfile'; neither
+ * is verified here.
+ *
+ * Returns 0 on success. On malformed input an error is reported and -1
+ * is returned; chunks parsed before the bad entry remain recorded in 'cf'.
+ */
+int read_table_of_contents(struct chunkfile *cf,
+			   const unsigned char *mfile,
+			   size_t mfile_size,
+			   uint64_t toc_offset,
+			   int toc_length)
+{
+	uint32_t chunk_id;
+	const unsigned char *table_of_contents = mfile + toc_offset;
+	const size_t hash_size = the_hash_algo->rawsz;
+
+	/* New entries are appended after any that 'cf' already holds. */
+	ALLOC_GROW(cf->chunks, cf->chunks_nr + toc_length, cf->chunks_alloc);
+
+	while (toc_length--) {
+		uint64_t chunk_offset, next_chunk_offset;
+
+		chunk_id = get_be32(table_of_contents);
+		chunk_offset = get_be64(table_of_contents + 4);
+
+		if (!chunk_id) {
+			error(_("terminating chunk id appears earlier than expected"));
+			return -1;
+		}
+
+		/* The following entry's offset marks where this chunk ends. */
+		table_of_contents += CHUNK_TOC_ENTRY_SIZE;
+		next_chunk_offset = get_be64(table_of_contents + 4);
+
+		/*
+		 * Offsets must not decrease and must stay at least
+		 * the_hash_algo->rawsz bytes short of the end of the file
+		 * (presumably the room taken by a trailing hash).
+		 *
+		 * 'mfile_size < hash_size' is tested first because the
+		 * unsigned subtraction would otherwise wrap around and
+		 * let any offset through.
+		 */
+		if (next_chunk_offset < chunk_offset ||
+		    mfile_size < hash_size ||
+		    next_chunk_offset > mfile_size - hash_size) {
+			error(_("improper chunk offset(s) %"PRIx64" and %"PRIx64""),
+			      chunk_offset, next_chunk_offset);
+			return -1;
+		}
+
+		cf->chunks[cf->chunks_nr].id = chunk_id;
+		cf->chunks[cf->chunks_nr].start = mfile + chunk_offset;
+		cf->chunks[cf->chunks_nr].size = next_chunk_offset - chunk_offset;
+		cf->chunks_nr++;
+	}
+
+	/* The entry after the last chunk must be the zero-id terminator. */
+	chunk_id = get_be32(table_of_contents);
+	if (chunk_id) {
+		error(_("final chunk has non-zero id %"PRIx32""), chunk_id);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * chunk_read_fn callback used by pair_chunk(): 'data' points at a
+ * 'const unsigned char *' that receives the chunk's start address.
+ * The chunk size is ignored. Always returns 0.
+ */
+static int pair_chunk_fn(const unsigned char *chunk_start,
+			 size_t chunk_size,
+			 void *data)
+{
+	const unsigned char **dest = data;
+
+	*dest = chunk_start;
+
+	return 0;
+}
+
+/*
+ * Look up 'chunk_id' in 'cf' and, when it is present, store the
+ * chunk's start address in '*p'. Implemented as a read_chunk() call
+ * with pair_chunk_fn as the callback, so the result is whatever
+ * read_chunk() returns.
+ */
+int pair_chunk(struct chunkfile *cf,
+	       uint32_t chunk_id,
+	       const unsigned char **p)
+{
+	int result = read_chunk(cf, chunk_id, pair_chunk_fn, p);
+
+	return result;
+}
+
+/*
+ * Scan the chunks recorded in 'cf' in order and hand the first one
+ * whose id equals 'chunk_id' to 'fn', together with 'data'.
+ *
+ * Returns the callback's return value, or CHUNK_NOT_FOUND when no
+ * recorded chunk has that id.
+ */
+int read_chunk(struct chunkfile *cf,
+	       uint32_t chunk_id,
+	       chunk_read_fn fn,
+	       void *data)
+{
+	int idx;
+
+	for (idx = 0; idx < cf->chunks_nr; idx++) {
+		if (cf->chunks[idx].id != chunk_id)
+			continue;
+
+		/* First match wins; later entries are never examined. */
+		return fn(cf->chunks[idx].start, cf->chunks[idx].size, data);
+	}
+
+	return CHUNK_NOT_FOUND;
+}
diff --git a/chunk-format.h b/chunk-format.h
index ce598b66d9f8..9ccbe0037792 100644
--- a/chunk-format.h
+++ b/chunk-format.h
@@ -8,6 +8,20 @@  struct chunkfile;
 
 #define CHUNK_TOC_ENTRY_SIZE (sizeof(uint32_t) + sizeof(uint64_t))
 
+/*
+ * Initialize a 'struct chunkfile' for writing _or_ reading a file
+ * with the chunk format.
+ *
+ * If writing a file, supply a non-NULL 'struct hashfile *' that will
+ * be used to write.
+ *
+ * If reading a file, use a NULL 'struct hashfile *' and then supply the
+ * memory-mapped data to read_table_of_contents(). Afterwards, use the
+ * pair_chunk() or read_chunk() methods, as appropriate.
+ *
+ * DO NOT MIX THESE MODES. Use different 'struct chunkfile' instances
+ * for reading and writing.
+ */
 struct chunkfile *init_chunkfile(struct hashfile *f);
 void free_chunkfile(struct chunkfile *cf);
 int get_num_chunks(struct chunkfile *cf);
@@ -18,4 +32,37 @@  void add_chunk(struct chunkfile *cf,
 	       chunk_write_fn fn);
 int write_chunkfile(struct chunkfile *cf, void *data);
 
+/*
+ * Read the table of contents found at 'mfile + toc_offset' and record
+ * the 'toc_length' chunks it describes in 'cf', so that pair_chunk()
+ * and read_chunk() can look them up by id afterwards.
+ *
+ * Returns 0 on success and a non-zero value if the table is malformed.
+ */
+int read_table_of_contents(struct chunkfile *cf,
+			   const unsigned char *mfile,
+			   size_t mfile_size,
+			   uint64_t toc_offset,
+			   int toc_length);
+
+#define CHUNK_NOT_FOUND (-2)
+
+/*
+ * Find 'chunk_id' in the given chunkfile and assign the
+ * given pointer to the position in the mmap'd file where
+ * that chunk begins.
+ *
+ * Returns 0 if the chunk was found and '*p' was assigned, or
+ * CHUNK_NOT_FOUND if the chunk does not exist (in which case
+ * '*p' is left untouched).
+ */
+int pair_chunk(struct chunkfile *cf,
+	       uint32_t chunk_id,
+	       const unsigned char **p);
+
+/*
+ * Callback invoked by read_chunk() with the start of the requested
+ * chunk, the chunk's size, and the caller-supplied 'data' pointer.
+ * Its return value becomes the return value of read_chunk().
+ */
+typedef int (*chunk_read_fn)(const unsigned char *chunk_start,
+			     size_t chunk_size, void *data);
+/*
+ * Find 'chunk_id' in the given chunkfile and call the
+ * given chunk_read_fn method with the information for
+ * that chunk.
+ *
+ * Returns CHUNK_NOT_FOUND if the chunk does not exist; otherwise
+ * returns whatever the chunk_read_fn method returned.
+ */
+int read_chunk(struct chunkfile *cf,
+	       uint32_t chunk_id,
+	       chunk_read_fn fn,
+	       void *data);
+
 #endif

[v4,12/17] chunk-format: create read chunk API

Commit Message

Patch