Message ID | 16c37d2370cf4fd5fc309ac6dc8aa6443ffcf3d7.1612535453.git.gitgitgadget@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | Refactor chunk-format into an API | expand |
> diff --git a/chunk-format.c b/chunk-format.c > new file mode 100644 > index 000000000000..6e0f1900213e > --- /dev/null > +++ b/chunk-format.c > @@ -0,0 +1,91 @@ > +#include "cache.h" > +#include "chunk-format.h" > +#include "csum-file.h" > +#define CHUNK_LOOKUP_WIDTH 12 As this macro is defined in 'chunk-format.c' it's not part of the chunkfile API. However, at the end of this patch series 'commit-graph.c' still contains: #define GRAPH_CHUNKLOOKUP_WIDTH 12 and uses it in a couple of safety checks (that didn't became part of the common chunkfile module; why?), while 'midx.c' contains: #define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t)) though it's not used anymore. I think we should have only one such constant as part of the chunkfile API; and preferably use the definition from 'midx.c' as it is more informative than yet another magic number. Furthermore, being called 'CHUNK_LOOKUP_WIDTH', I had to look up the places where this constant is used to make sure that it indeed means what I suspect it means. Perhaps CHUNK_TOC_ENTRY_SIZE would be a more descriptive name for this constant. On a somewhat related note: 'commit-graph.c' and 'midx.c' still contains the constants MAX_NUM_CHUNKS and MIDX_MAX_CHUNKS, respecticely, but neither of them is used anymore.
On 2/7/2021 4:13 PM, SZEDER Gábor wrote: >> +#define CHUNK_LOOKUP_WIDTH 12 > > As this macro is defined in 'chunk-format.c' it's not part of the > chunkfile API. However, at the end of this patch series > 'commit-graph.c' still contains: > > #define GRAPH_CHUNKLOOKUP_WIDTH 12 > > and uses it in a couple of safety checks (that didn't became part of > the common chunkfile module; why?), Chunk-based files don't have a minimum size unless we know the header size and a minimum number of required chunks. I suppose that we could add this in the future to further simplify consumers of the API. > while 'midx.c' contains: > > #define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t)) > > though it's not used anymore. > > I think we should have only one such constant as part of the chunkfile > API; and preferably use the definition from 'midx.c' as it is more > informative than yet another magic number. > > Furthermore, being called 'CHUNK_LOOKUP_WIDTH', I had to look up the > places where this constant is used to make sure that it indeed means > what I suspect it means. Perhaps CHUNK_TOC_ENTRY_SIZE would be a more > descriptive name for this constant. More descriptive, for sure. > On a somewhat related note: 'commit-graph.c' and 'midx.c' still > contains the constants MAX_NUM_CHUNKS and MIDX_MAX_CHUNKS, > respecticely, but neither of them is used anymore. Thanks. The following patch can be added on top of this series to clean up these dangling macros. Thanks, -Stolee --- >8 --- From 839b880ccee65eac63e8b77b12fab6531acc55b0 Mon Sep 17 00:00:00 2001 From: Derrick Stolee <dstolee@microsoft.com> Date: Mon, 8 Feb 2021 08:38:47 -0500 Subject: [PATCH] chunk-format: remove outdated macro constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following macros were needed by midx.c and commit-graph.c to handle their independent implementations of the chunk-based file format, but now the chunk-format API makes them obsolete: * MAX_NUM_CHUNKS * MIDX_MAX_CHUNKS * MIX_CHUNKLOOKUP_WIDTH Reported-by: SZEDER Gábor <szeder.dev@gmail.com> Signed-off-by: Derrick Stolee <dstolee@microsoft.com> --- commit-graph.c | 1 - midx.c | 2 -- 2 files changed, 3 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index 32cf5091d2f..3b5a8767269 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -45,7 +45,6 @@ void git_test_write_commit_graph_or_die(void) #define GRAPH_CHUNKID_BLOOMINDEXES 0x42494458 /* "BIDX" */ #define GRAPH_CHUNKID_BLOOMDATA 0x42444154 /* "BDAT" */ #define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */ -#define MAX_NUM_CHUNKS 9 #define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16) diff --git a/midx.c b/midx.c index 95648a1f368..5c7f2ed2333 100644 --- a/midx.c +++ b/midx.c @@ -22,14 +22,12 @@ #define MIDX_HEADER_SIZE 12 #define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + the_hash_algo->rawsz) -#define MIDX_MAX_CHUNKS 5 #define MIDX_CHUNK_ALIGNMENT 4 #define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */ #define MIDX_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */ #define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */ #define MIDX_CHUNKID_OBJECTOFFSETS 0x4f4f4646 /* "OOFF" */ #define MIDX_CHUNKID_LARGEOFFSETS 0x4c4f4646 /* "LOFF" */ -#define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t)) #define MIDX_CHUNK_FANOUT_SIZE (sizeof(uint32_t) * 256) #define MIDX_CHUNK_OFFSET_WIDTH (2 * sizeof(uint32_t)) #define MIDX_CHUNK_LARGE_OFFSET_WIDTH (sizeof(uint64_t))
On Mon, Feb 08, 2021 at 08:44:06AM -0500, Derrick Stolee wrote: > On 2/7/2021 4:13 PM, SZEDER Gábor wrote: > >> +#define CHUNK_LOOKUP_WIDTH 12 > > > > As this macro is defined in 'chunk-format.c' it's not part of the > > chunkfile API. However, at the end of this patch series > > 'commit-graph.c' still contains: > > > > #define GRAPH_CHUNKLOOKUP_WIDTH 12 > > > > and uses it in a couple of safety checks (that didn't became part of > > the common chunkfile module; why?), > > Chunk-based files don't have a minimum size unless we know the header > size and a minimum number of required chunks. I suppose that we could > add this in the future to further simplify consumers of the API. > > > while 'midx.c' contains: > > > > #define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t)) > > > > though it's not used anymore. > > > > I think we should have only one such constant as part of the chunkfile > > API; and preferably use the definition from 'midx.c' as it is more > > informative than yet another magic number. > > > > Furthermore, being called 'CHUNK_LOOKUP_WIDTH', I had to look up the > > places where this constant is used to make sure that it indeed means > > what I suspect it means. Perhaps CHUNK_TOC_ENTRY_SIZE would be a more > > descriptive name for this constant. > > More descriptive, for sure. > > > On a somewhat related note: 'commit-graph.c' and 'midx.c' still > > contains the constants MAX_NUM_CHUNKS and MIDX_MAX_CHUNKS, > > respecticely, but neither of them is used anymore. > > Thanks. The following patch can be added on top of this series > to clean up these dangling macros. It would be better to squash this into the patches that removed the last uses of each of those constants. And it still leaves the magic number '12' duplicated in 'commit-graph.c' and 'chunk-format.c'. > Thanks, > -Stolee > > --- >8 --- > > From 839b880ccee65eac63e8b77b12fab6531acc55b0 Mon Sep 17 00:00:00 2001 > From: Derrick Stolee <dstolee@microsoft.com> > Date: Mon, 8 Feb 2021 08:38:47 -0500 > Subject: [PATCH] chunk-format: remove outdated macro constants > MIME-Version: 1.0 > Content-Type: text/plain; charset=UTF-8 > Content-Transfer-Encoding: 8bit > > The following macros were needed by midx.c and commit-graph.c to handle > their independent implementations of the chunk-based file format, but > now the chunk-format API makes them obsolete: > > * MAX_NUM_CHUNKS > * MIDX_MAX_CHUNKS > * MIX_CHUNKLOOKUP_WIDTH > > Reported-by: SZEDER Gábor <szeder.dev@gmail.com> > Signed-off-by: Derrick Stolee <dstolee@microsoft.com> > --- > commit-graph.c | 1 - > midx.c | 2 -- > 2 files changed, 3 deletions(-) > > diff --git a/commit-graph.c b/commit-graph.c > index 32cf5091d2f..3b5a8767269 100644 > --- a/commit-graph.c > +++ b/commit-graph.c > @@ -45,7 +45,6 @@ void git_test_write_commit_graph_or_die(void) > #define GRAPH_CHUNKID_BLOOMINDEXES 0x42494458 /* "BIDX" */ > #define GRAPH_CHUNKID_BLOOMDATA 0x42444154 /* "BDAT" */ > #define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */ > -#define MAX_NUM_CHUNKS 9 > > #define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16) > > diff --git a/midx.c b/midx.c > index 95648a1f368..5c7f2ed2333 100644 > --- a/midx.c > +++ b/midx.c > @@ -22,14 +22,12 @@ > #define MIDX_HEADER_SIZE 12 > #define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + the_hash_algo->rawsz) > > -#define MIDX_MAX_CHUNKS 5 > #define MIDX_CHUNK_ALIGNMENT 4 > #define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */ > #define MIDX_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */ > #define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */ > #define MIDX_CHUNKID_OBJECTOFFSETS 0x4f4f4646 /* "OOFF" */ > #define MIDX_CHUNKID_LARGEOFFSETS 0x4c4f4646 /* "LOFF" */ > -#define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t)) > #define MIDX_CHUNK_FANOUT_SIZE (sizeof(uint32_t) * 256) > #define MIDX_CHUNK_OFFSET_WIDTH (2 * sizeof(uint32_t)) > #define MIDX_CHUNK_LARGE_OFFSET_WIDTH (sizeof(uint64_t)) > -- > 2.30.0.vfs.0.0.exp >
diff --git a/Makefile b/Makefile index 7b64106930a6..50a7663841e9 100644 --- a/Makefile +++ b/Makefile @@ -854,6 +854,7 @@ LIB_OBJS += bundle.o LIB_OBJS += cache-tree.o LIB_OBJS += chdir-notify.o LIB_OBJS += checkout.o +LIB_OBJS += chunk-format.o LIB_OBJS += color.o LIB_OBJS += column.o LIB_OBJS += combine-diff.o diff --git a/chunk-format.c b/chunk-format.c new file mode 100644 index 000000000000..6e0f1900213e --- /dev/null +++ b/chunk-format.c @@ -0,0 +1,91 @@ +#include "cache.h" +#include "chunk-format.h" +#include "csum-file.h" +#define CHUNK_LOOKUP_WIDTH 12 + +/* + * When writing a chunk-based file format, collect the chunks in + * an array of chunk_info structs. The size stores the _expected_ + * amount of data that will be written by write_fn. + */ +struct chunk_info { + uint32_t id; + uint64_t size; + chunk_write_fn write_fn; +}; + +struct chunkfile { + struct hashfile *f; + + struct chunk_info *chunks; + size_t chunks_nr; + size_t chunks_alloc; +}; + +struct chunkfile *init_chunkfile(struct hashfile *f) +{ + struct chunkfile *cf = xcalloc(1, sizeof(*cf)); + cf->f = f; + return cf; +} + +void free_chunkfile(struct chunkfile *cf) +{ + if (!cf) + return; + free(cf->chunks); + free(cf); +} + +int get_num_chunks(struct chunkfile *cf) +{ + return cf->chunks_nr; +} + +void add_chunk(struct chunkfile *cf, + uint32_t id, + size_t size, + chunk_write_fn fn) +{ + ALLOC_GROW(cf->chunks, cf->chunks_nr + 1, cf->chunks_alloc); + + cf->chunks[cf->chunks_nr].id = id; + cf->chunks[cf->chunks_nr].write_fn = fn; + cf->chunks[cf->chunks_nr].size = size; + cf->chunks_nr++; +} + +int write_chunkfile(struct chunkfile *cf, void *data) +{ + int i; + uint64_t cur_offset = hashfile_total(cf->f); + + /* Add the table of contents to the current offset */ + cur_offset += (cf->chunks_nr + 1) * CHUNK_LOOKUP_WIDTH; + + for (i = 0; i < cf->chunks_nr; i++) { + hashwrite_be32(cf->f, cf->chunks[i].id); + hashwrite_be64(cf->f, cur_offset); + + cur_offset += cf->chunks[i].size; + } + + /* Trailing entry marks the end of the chunks */ + hashwrite_be32(cf->f, 0); + hashwrite_be64(cf->f, cur_offset); + + for (i = 0; i < cf->chunks_nr; i++) { + off_t start_offset = hashfile_total(cf->f); + int result = cf->chunks[i].write_fn(cf->f, data); + + if (result) + return result; + + if (hashfile_total(cf->f) - start_offset != cf->chunks[i].size) + BUG("expected to write %"PRId64" bytes to chunk %"PRIx32", but wrote %"PRId64" instead", + cf->chunks[i].size, cf->chunks[i].id, + hashfile_total(cf->f) - start_offset); + } + + return 0; +} diff --git a/chunk-format.h b/chunk-format.h new file mode 100644 index 000000000000..9a1d770accec --- /dev/null +++ b/chunk-format.h @@ -0,0 +1,19 @@ +#ifndef CHUNK_FORMAT_H +#define CHUNK_FORMAT_H + +#include "git-compat-util.h" + +struct hashfile; +struct chunkfile; + +struct chunkfile *init_chunkfile(struct hashfile *f); +void free_chunkfile(struct chunkfile *cf); +int get_num_chunks(struct chunkfile *cf); +typedef int (*chunk_write_fn)(struct hashfile *f, void *data); +void add_chunk(struct chunkfile *cf, + uint32_t id, + size_t size, + chunk_write_fn fn); +int write_chunkfile(struct chunkfile *cf, void *data); + +#endif