diff mbox series

[v3,02/17] chunk-format: create chunk format write API

Message ID 16c37d2370cf4fd5fc309ac6dc8aa6443ffcf3d7.1612535453.git.gitgitgadget@gmail.com (mailing list archive)
State Superseded
Headers show
Series Refactor chunk-format into an API | expand

Commit Message

Derrick Stolee Feb. 5, 2021, 2:30 p.m. UTC
From: Derrick Stolee <dstolee@microsoft.com>

In anticipation of combining the logic from the commit-graph and
multi-pack-index file formats, create a new chunk-format API. Use a
'struct chunkfile' pointer to keep track of data that has been
registered for writes. This struct is anonymous outside of
chunk-format.c to ensure no user attempts to interfere with the data.

The next change will use this API in commit-graph.c, but the general
approach is:

 1. initialize the chunkfile with init_chunkfile(f).
 2. add chunks in the intended writing order with add_chunk().
 3. write any header information to the hashfile f.
 4. write the chunkfile data using write_chunkfile().
 5. free the chunkfile struct using free_chunkfile().

Helped-by: Taylor Blau <me@ttaylorr.com>
Helped-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Makefile       |  1 +
 chunk-format.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++
 chunk-format.h | 19 +++++++++++
 3 files changed, 111 insertions(+)
 create mode 100644 chunk-format.c
 create mode 100644 chunk-format.h

Comments

SZEDER Gábor Feb. 7, 2021, 9:13 p.m. UTC | #1
> diff --git a/chunk-format.c b/chunk-format.c
> new file mode 100644
> index 000000000000..6e0f1900213e
> --- /dev/null
> +++ b/chunk-format.c
> @@ -0,0 +1,91 @@
> +#include "cache.h"
> +#include "chunk-format.h"
> +#include "csum-file.h"
> +#define CHUNK_LOOKUP_WIDTH 12

As this macro is defined in 'chunk-format.c' it's not part of the
chunkfile API.  However, at the end of this patch series
'commit-graph.c' still contains:

  #define GRAPH_CHUNKLOOKUP_WIDTH 12

and uses it in a couple of safety checks (that didn't became part of
the common chunkfile module; why?), while 'midx.c' contains:

  #define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))

though it's not used anymore.

I think we should have only one such constant as part of the chunkfile
API; and preferably use the definition from 'midx.c' as it is more
informative than yet another magic number.

Furthermore, being called 'CHUNK_LOOKUP_WIDTH', I had to look up the
places where this constant is used to make sure that it indeed means
what I suspect it means.  Perhaps CHUNK_TOC_ENTRY_SIZE would be a more
descriptive name for this constant.


On a somewhat related note: 'commit-graph.c' and 'midx.c' still
contains the constants MAX_NUM_CHUNKS and MIDX_MAX_CHUNKS,
respecticely, but neither of them is used anymore.
Derrick Stolee Feb. 8, 2021, 1:44 p.m. UTC | #2
On 2/7/2021 4:13 PM, SZEDER Gábor wrote:
>> +#define CHUNK_LOOKUP_WIDTH 12
> 
> As this macro is defined in 'chunk-format.c' it's not part of the
> chunkfile API.  However, at the end of this patch series
> 'commit-graph.c' still contains:
> 
>   #define GRAPH_CHUNKLOOKUP_WIDTH 12
> 
> and uses it in a couple of safety checks (that didn't became part of
> the common chunkfile module; why?),

Chunk-based files don't have a minimum size unless we know the header
size and a minimum number of required chunks. I suppose that we could
add this in the future to further simplify consumers of the API.

> while 'midx.c' contains:
> 
>   #define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))
> 
> though it's not used anymore.
> 
> I think we should have only one such constant as part of the chunkfile
> API; and preferably use the definition from 'midx.c' as it is more
> informative than yet another magic number.
> 
> Furthermore, being called 'CHUNK_LOOKUP_WIDTH', I had to look up the
> places where this constant is used to make sure that it indeed means
> what I suspect it means.  Perhaps CHUNK_TOC_ENTRY_SIZE would be a more
> descriptive name for this constant.

More descriptive, for sure.

> On a somewhat related note: 'commit-graph.c' and 'midx.c' still
> contains the constants MAX_NUM_CHUNKS and MIDX_MAX_CHUNKS,
> respecticely, but neither of them is used anymore.

Thanks. The following patch can be added on top of this series
to clean up these dangling macros.

Thanks,
-Stolee

--- >8 ---

From 839b880ccee65eac63e8b77b12fab6531acc55b0 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <dstolee@microsoft.com>
Date: Mon, 8 Feb 2021 08:38:47 -0500
Subject: [PATCH] chunk-format: remove outdated macro constants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The following macros were needed by midx.c and commit-graph.c to handle
their independent implementations of the chunk-based file format, but
now the chunk-format API makes them obsolete:

* MAX_NUM_CHUNKS
* MIDX_MAX_CHUNKS
* MIX_CHUNKLOOKUP_WIDTH

Reported-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 commit-graph.c | 1 -
 midx.c         | 2 --
 2 files changed, 3 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 32cf5091d2f..3b5a8767269 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -45,7 +45,6 @@ void git_test_write_commit_graph_or_die(void)
 #define GRAPH_CHUNKID_BLOOMINDEXES 0x42494458 /* "BIDX" */
 #define GRAPH_CHUNKID_BLOOMDATA 0x42444154 /* "BDAT" */
 #define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */
-#define MAX_NUM_CHUNKS 9
 
 #define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16)
 
diff --git a/midx.c b/midx.c
index 95648a1f368..5c7f2ed2333 100644
--- a/midx.c
+++ b/midx.c
@@ -22,14 +22,12 @@
 #define MIDX_HEADER_SIZE 12
 #define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + the_hash_algo->rawsz)
 
-#define MIDX_MAX_CHUNKS 5
 #define MIDX_CHUNK_ALIGNMENT 4
 #define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
 #define MIDX_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
 #define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
 #define MIDX_CHUNKID_OBJECTOFFSETS 0x4f4f4646 /* "OOFF" */
 #define MIDX_CHUNKID_LARGEOFFSETS 0x4c4f4646 /* "LOFF" */
-#define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))
 #define MIDX_CHUNK_FANOUT_SIZE (sizeof(uint32_t) * 256)
 #define MIDX_CHUNK_OFFSET_WIDTH (2 * sizeof(uint32_t))
 #define MIDX_CHUNK_LARGE_OFFSET_WIDTH (sizeof(uint64_t))
SZEDER Gábor Feb. 11, 2021, 7:43 p.m. UTC | #3
On Mon, Feb 08, 2021 at 08:44:06AM -0500, Derrick Stolee wrote:
> On 2/7/2021 4:13 PM, SZEDER Gábor wrote:
> >> +#define CHUNK_LOOKUP_WIDTH 12
> > 
> > As this macro is defined in 'chunk-format.c' it's not part of the
> > chunkfile API.  However, at the end of this patch series
> > 'commit-graph.c' still contains:
> > 
> >   #define GRAPH_CHUNKLOOKUP_WIDTH 12
> > 
> > and uses it in a couple of safety checks (that didn't became part of
> > the common chunkfile module; why?),
> 
> Chunk-based files don't have a minimum size unless we know the header
> size and a minimum number of required chunks. I suppose that we could
> add this in the future to further simplify consumers of the API.
> 
> > while 'midx.c' contains:
> > 
> >   #define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))
> > 
> > though it's not used anymore.
> > 
> > I think we should have only one such constant as part of the chunkfile
> > API; and preferably use the definition from 'midx.c' as it is more
> > informative than yet another magic number.
> > 
> > Furthermore, being called 'CHUNK_LOOKUP_WIDTH', I had to look up the
> > places where this constant is used to make sure that it indeed means
> > what I suspect it means.  Perhaps CHUNK_TOC_ENTRY_SIZE would be a more
> > descriptive name for this constant.
> 
> More descriptive, for sure.
> 
> > On a somewhat related note: 'commit-graph.c' and 'midx.c' still
> > contains the constants MAX_NUM_CHUNKS and MIDX_MAX_CHUNKS,
> > respecticely, but neither of them is used anymore.
> 
> Thanks. The following patch can be added on top of this series
> to clean up these dangling macros.

It would be better to squash this into the patches that removed the
last uses of each of those constants.

And it still leaves the magic number '12' duplicated in
'commit-graph.c' and 'chunk-format.c'.

> Thanks,
> -Stolee
> 
> --- >8 ---
> 
> From 839b880ccee65eac63e8b77b12fab6531acc55b0 Mon Sep 17 00:00:00 2001
> From: Derrick Stolee <dstolee@microsoft.com>
> Date: Mon, 8 Feb 2021 08:38:47 -0500
> Subject: [PATCH] chunk-format: remove outdated macro constants
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
> 
> The following macros were needed by midx.c and commit-graph.c to handle
> their independent implementations of the chunk-based file format, but
> now the chunk-format API makes them obsolete:
> 
> * MAX_NUM_CHUNKS
> * MIDX_MAX_CHUNKS
> * MIX_CHUNKLOOKUP_WIDTH
> 
> Reported-by: SZEDER Gábor <szeder.dev@gmail.com>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  commit-graph.c | 1 -
>  midx.c         | 2 --
>  2 files changed, 3 deletions(-)
> 
> diff --git a/commit-graph.c b/commit-graph.c
> index 32cf5091d2f..3b5a8767269 100644
> --- a/commit-graph.c
> +++ b/commit-graph.c
> @@ -45,7 +45,6 @@ void git_test_write_commit_graph_or_die(void)
>  #define GRAPH_CHUNKID_BLOOMINDEXES 0x42494458 /* "BIDX" */
>  #define GRAPH_CHUNKID_BLOOMDATA 0x42444154 /* "BDAT" */
>  #define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */
> -#define MAX_NUM_CHUNKS 9
>  
>  #define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16)
>  
> diff --git a/midx.c b/midx.c
> index 95648a1f368..5c7f2ed2333 100644
> --- a/midx.c
> +++ b/midx.c
> @@ -22,14 +22,12 @@
>  #define MIDX_HEADER_SIZE 12
>  #define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + the_hash_algo->rawsz)
>  
> -#define MIDX_MAX_CHUNKS 5
>  #define MIDX_CHUNK_ALIGNMENT 4
>  #define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
>  #define MIDX_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
>  #define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
>  #define MIDX_CHUNKID_OBJECTOFFSETS 0x4f4f4646 /* "OOFF" */
>  #define MIDX_CHUNKID_LARGEOFFSETS 0x4c4f4646 /* "LOFF" */
> -#define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))
>  #define MIDX_CHUNK_FANOUT_SIZE (sizeof(uint32_t) * 256)
>  #define MIDX_CHUNK_OFFSET_WIDTH (2 * sizeof(uint32_t))
>  #define MIDX_CHUNK_LARGE_OFFSET_WIDTH (sizeof(uint64_t))
> -- 
> 2.30.0.vfs.0.0.exp
>
diff mbox series

Patch

diff --git a/Makefile b/Makefile
index 7b64106930a6..50a7663841e9 100644
--- a/Makefile
+++ b/Makefile
@@ -854,6 +854,7 @@  LIB_OBJS += bundle.o
 LIB_OBJS += cache-tree.o
 LIB_OBJS += chdir-notify.o
 LIB_OBJS += checkout.o
+LIB_OBJS += chunk-format.o
 LIB_OBJS += color.o
 LIB_OBJS += column.o
 LIB_OBJS += combine-diff.o
diff --git a/chunk-format.c b/chunk-format.c
new file mode 100644
index 000000000000..6e0f1900213e
--- /dev/null
+++ b/chunk-format.c
@@ -0,0 +1,91 @@ 
+#include "cache.h"
+#include "chunk-format.h"
+#include "csum-file.h"
+#define CHUNK_LOOKUP_WIDTH 12
+
+/*
+ * When writing a chunk-based file format, collect the chunks in
+ * an array of chunk_info structs. The size stores the _expected_
+ * amount of data that will be written by write_fn.
+ */
+struct chunk_info {
+	uint32_t id;
+	uint64_t size;
+	chunk_write_fn write_fn;
+};
+
+struct chunkfile {
+	struct hashfile *f;
+
+	struct chunk_info *chunks;
+	size_t chunks_nr;
+	size_t chunks_alloc;
+};
+
+struct chunkfile *init_chunkfile(struct hashfile *f)
+{
+	struct chunkfile *cf = xcalloc(1, sizeof(*cf));
+	cf->f = f;
+	return cf;
+}
+
+void free_chunkfile(struct chunkfile *cf)
+{
+	if (!cf)
+		return;
+	free(cf->chunks);
+	free(cf);
+}
+
+int get_num_chunks(struct chunkfile *cf)
+{
+	return cf->chunks_nr;
+}
+
+void add_chunk(struct chunkfile *cf,
+	       uint32_t id,
+	       size_t size,
+	       chunk_write_fn fn)
+{
+	ALLOC_GROW(cf->chunks, cf->chunks_nr + 1, cf->chunks_alloc);
+
+	cf->chunks[cf->chunks_nr].id = id;
+	cf->chunks[cf->chunks_nr].write_fn = fn;
+	cf->chunks[cf->chunks_nr].size = size;
+	cf->chunks_nr++;
+}
+
+int write_chunkfile(struct chunkfile *cf, void *data)
+{
+	int i;
+	uint64_t cur_offset = hashfile_total(cf->f);
+
+	/* Add the table of contents to the current offset */
+	cur_offset += (cf->chunks_nr + 1) * CHUNK_LOOKUP_WIDTH;
+
+	for (i = 0; i < cf->chunks_nr; i++) {
+		hashwrite_be32(cf->f, cf->chunks[i].id);
+		hashwrite_be64(cf->f, cur_offset);
+
+		cur_offset += cf->chunks[i].size;
+	}
+
+	/* Trailing entry marks the end of the chunks */
+	hashwrite_be32(cf->f, 0);
+	hashwrite_be64(cf->f, cur_offset);
+
+	for (i = 0; i < cf->chunks_nr; i++) {
+		off_t start_offset = hashfile_total(cf->f);
+		int result = cf->chunks[i].write_fn(cf->f, data);
+
+		if (result)
+			return result;
+
+		if (hashfile_total(cf->f) - start_offset != cf->chunks[i].size)
+			BUG("expected to write %"PRId64" bytes to chunk %"PRIx32", but wrote %"PRId64" instead",
+			    cf->chunks[i].size, cf->chunks[i].id,
+			    hashfile_total(cf->f) - start_offset);
+	}
+
+	return 0;
+}
diff --git a/chunk-format.h b/chunk-format.h
new file mode 100644
index 000000000000..9a1d770accec
--- /dev/null
+++ b/chunk-format.h
@@ -0,0 +1,19 @@ 
+#ifndef CHUNK_FORMAT_H
+#define CHUNK_FORMAT_H
+
+#include "git-compat-util.h"
+
+struct hashfile;
+struct chunkfile;
+
+struct chunkfile *init_chunkfile(struct hashfile *f);
+void free_chunkfile(struct chunkfile *cf);
+int get_num_chunks(struct chunkfile *cf);
+typedef int (*chunk_write_fn)(struct hashfile *f, void *data);
+void add_chunk(struct chunkfile *cf,
+	       uint32_t id,
+	       size_t size,
+	       chunk_write_fn fn);
+int write_chunkfile(struct chunkfile *cf, void *data);
+
+#endif