diff mbox series

[v3,04/17] chunk-format.h: extract oid_version()

Message ID 135a07276b0a40b04f2c28d4f48c26b1af76c12c.1646266835.git.me@ttaylorr.com (mailing list archive)
State Accepted
Commit d9fef9d90d27b6794350ec3bc622042b79397088
Headers show
Series cruft packs | expand

Commit Message

Taylor Blau March 3, 2022, 12:20 a.m. UTC
There are three definitions of an identical function which converts
`the_hash_algo` into either 1 (for SHA-1) or 2 (for SHA-256). There is a
copy of this function for writing both the commit-graph and
multi-pack-index file, and another inline definition used to write the
.rev header.

Consolidate these into a single definition in chunk-format.h. It's not
clear that this is the best header to define this function in, but it
should do for now.

(Worth noting, the .rev caller expects a 4-byte unsigned, but the other
two callers work with a single unsigned byte. The consolidated version
uses the latter type, and lets the compiler widen it when required).

Another caller will be added in a subsequent patch.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 chunk-format.c | 12 ++++++++++++
 chunk-format.h |  3 +++
 commit-graph.c | 18 +++---------------
 midx.c         | 18 +++---------------
 pack-write.c   | 15 ++-------------
 5 files changed, 23 insertions(+), 43 deletions(-)

Comments

Ævar Arnfjörð Bjarmason March 3, 2022, 4:30 p.m. UTC | #1
On Wed, Mar 02 2022, Taylor Blau wrote:

> Consolidate these into a single definition in chunk-format.h. It's not
> clear that this is the best header to define this function in, but it
> should do for now.
> [...]
> +
> +uint8_t oid_version(const struct git_hash_algo *algop)
> +{
> +	switch (hash_algo_by_ptr(algop)) {
> +	case GIT_HASH_SHA1:
> +		return 1;
> +	case GIT_HASH_SHA256:
> +		return 2;

Not a new issue, but I wonder why these don't return hash_algo_by_ptr
aka GIT_HASH_WHATEVER here. I.e. this is the same as this more
straightforward & obvious code that avoids re-hardcoding the magic
constants:

	const int algo = hash_algo_by_ptr(algop)

	switch (algo) {
	case GIT_HASH_SHA1:
	case GIT_HASH_SHA256:
		return algo;
	default:
        [...]
        }

Probably best left as a later cleanup. FWIW I came up with this on top
of my designated init series:

diff --git a/hash.h b/hash.h
index 5d40368f18a..fd710ec6ae8 100644
--- a/hash.h
+++ b/hash.h
@@ -86,14 +86,18 @@ static inline void git_SHA256_Clone(git_SHA256_CTX *dst, const git_SHA256_CTX *s
  * field for being non-zero.  Use the name field for user-visible situations and
  * the format_id field for fixed-length fields on disk.
  */
-/* An unknown hash function. */
-#define GIT_HASH_UNKNOWN 0
-/* SHA-1 */
-#define GIT_HASH_SHA1 1
-/* SHA-256  */
-#define GIT_HASH_SHA256 2
-/* Number of algorithms supported (including unknown). */
-#define GIT_HASH_NALGOS (GIT_HASH_SHA256 + 1)
+enum git_hash_algo_name {
+	/* An unknown hash function. */
+	GIT_HASH_UNKNOWN,
+	/* SHA-1 */
+	GIT_HASH_SHA1,
+	GIT_HASH_SHA256,
+	/*
+	 * Number of algorithms supported (including unknown). This
+	 * must be kept last!
+	 */
+	GIT_HASH_NALGOS,
+};
 
 /* "sha1", big-endian */
 #define GIT_SHA1_FORMAT_ID 0x73686131
diff --git a/object-file.c b/object-file.c
index 5074471b471..f2d54a86969 100644
--- a/object-file.c
+++ b/object-file.c
@@ -166,7 +166,7 @@ static void git_hash_unknown_final_oid(struct object_id *oid, git_hash_ctx *ctx)
 }
 
 const struct git_hash_algo hash_algos[GIT_HASH_NALGOS] = {
-	{
+	[GIT_HASH_UNKNOWN] = {
 		.name = NULL,
 		.format_id = 0x00000000,
 		.rawsz = 0,
@@ -181,7 +181,7 @@ const struct git_hash_algo hash_algos[GIT_HASH_NALGOS] = {
 		.empty_blob = NULL,
 		.null_oid = NULL,
 	},
-	{
+	[GIT_HASH_SHA1] = {
 		.name = "sha1",
 		.format_id = GIT_SHA1_FORMAT_ID,
 		.rawsz = GIT_SHA1_RAWSZ,
@@ -196,7 +196,7 @@ const struct git_hash_algo hash_algos[GIT_HASH_NALGOS] = {
 		.empty_blob = &empty_blob_oid,
 		.null_oid = &null_oid_sha1,
 	},
-	{
+	[GIT_HASH_SHA256] = {
 		.name = "sha256",
 		.format_id = GIT_SHA256_FORMAT_ID,
 		.rawsz = GIT_SHA256_RAWSZ,
Taylor Blau March 3, 2022, 11:32 p.m. UTC | #2
On Thu, Mar 03, 2022 at 05:30:44PM +0100, Ævar Arnfjörð Bjarmason wrote:
>
> On Wed, Mar 02 2022, Taylor Blau wrote:
>
> > Consolidate these into a single definition in chunk-format.h. It's not
> > clear that this is the best header to define this function in, but it
> > should do for now.
> > [...]
> > +
> > +uint8_t oid_version(const struct git_hash_algo *algop)
> > +{
> > +	switch (hash_algo_by_ptr(algop)) {
> > +	case GIT_HASH_SHA1:
> > +		return 1;
> > +	case GIT_HASH_SHA256:
> > +		return 2;
>
> Not a new issue, but I wonder why these don't return hash_algo_by_ptr
> aka GIT_HASH_WHATEVER here. I.e. this is the same as this more
> straightforward & obvious code that avoids re-hardcoding the magic
> constants:

Hmm. Certainly the value returned by hash_algo_by_ptr() works for SHA-1
and SHA-256, but writes may want to use a different value for future
hashes. Not that this couldn't be changed then, but my feeling is that
the existing code is clearer since it avoids the reader having to jump
to hash_algo_by_ptr()'s implementation to figure out what it returns.

Thanks,
Taylor
Junio C Hamano March 4, 2022, 12:16 a.m. UTC | #3
Taylor Blau <me@ttaylorr.com> writes:

> On Thu, Mar 03, 2022 at 05:30:44PM +0100, Ævar Arnfjörð Bjarmason wrote:
>>
>> On Wed, Mar 02 2022, Taylor Blau wrote:
>>
>> > Consolidate these into a single definition in chunk-format.h. It's not
>> > clear that this is the best header to define this function in, but it
>> > should do for now.
>> > [...]
>> > +
>> > +uint8_t oid_version(const struct git_hash_algo *algop)
>> > +{
>> > +	switch (hash_algo_by_ptr(algop)) {
>> > +	case GIT_HASH_SHA1:
>> > +		return 1;
>> > +	case GIT_HASH_SHA256:
>> > +		return 2;
>>
>> Not a new issue, but I wonder why these don't return hash_algo_by_ptr
>> aka GIT_HASH_WHATEVER here. I.e. this is the same as this more
>> straightforward & obvious code that avoids re-hardcoding the magic
>> constants:
>
> Hmm. Certainly the value returned by hash_algo_by_ptr() works for SHA-1
> and SHA-256, but writes may want to use a different value for future
> hashes. Not that this couldn't be changed then, but my feeling is that
> the existing code is clearer since it avoids the reader having to jump
> to hash_algo_by_ptr()'s implementation to figure out what it returns.

If we promise that everywhere in file formats where we identify what
hash is used, we write "1" for SHA1 and "2" for SHA256, it would be
natural to define GIT_HASH_SHA1 to "1" and GIT_HASH_SHA256 to "2".

And readers do not have to "figure out", if that is a clearly
written guideline to represent the hash used in file formats.  As
written, the readers who -assumes- such a guideline is there must
figure out from hash.h that GIT_HASH_SHA1 is 1 and GIT_HASH_SHA256
is 2 to be convinced that the above code is correct.

Now, hash.h says GIT_HASH_SHA1 is 1 and GIT_HASH_SHA256 is 2.  So

	int oidv = hash_algo_by_ptr(algop)
	switch (oidv) {
	case GIT_HASH_SHA1:
	case GIT_HASH_SHA256:
		return oidv;
	default:
		die();
	}

should work already.  To put it differently, if this didn't work, we
should renumber GIT_HASH_SHA1 and GIT_HASH_SHA256 to make it work, I
would think.  If not, we have a huge mess on our hands, as constants
used in on-disk file formats is hard (almost impossible) to change.

An overly generic function name oid_version() cannot be justified
unless the same constants are used everywhere.  I see hits from 'git
grep oid_version' in

    chunk-format.c (obviously)
    commit-graph.c
    midx.c
    pack-write.c

so presumably these types of files are using the "canonical"
numbering.

And when we introduce GIT_HASH_SHA3 or whatever, we should give it a
number that this function can return (i.e. from the range 3..255).
diff mbox series

Patch

diff --git a/chunk-format.c b/chunk-format.c
index 1c3dca62e2..0275b74a89 100644
--- a/chunk-format.c
+++ b/chunk-format.c
@@ -181,3 +181,15 @@  int read_chunk(struct chunkfile *cf,
 
 	return CHUNK_NOT_FOUND;
 }
+
+uint8_t oid_version(const struct git_hash_algo *algop)
+{
+	switch (hash_algo_by_ptr(algop)) {
+	case GIT_HASH_SHA1:
+		return 1;
+	case GIT_HASH_SHA256:
+		return 2;
+	default:
+		die(_("invalid hash version"));
+	}
+}
diff --git a/chunk-format.h b/chunk-format.h
index 9ccbe00377..7885aa0848 100644
--- a/chunk-format.h
+++ b/chunk-format.h
@@ -2,6 +2,7 @@ 
 #define CHUNK_FORMAT_H
 
 #include "git-compat-util.h"
+#include "hash.h"
 
 struct hashfile;
 struct chunkfile;
@@ -65,4 +66,6 @@  int read_chunk(struct chunkfile *cf,
 	       chunk_read_fn fn,
 	       void *data);
 
+uint8_t oid_version(const struct git_hash_algo *algop);
+
 #endif
diff --git a/commit-graph.c b/commit-graph.c
index 265c010122..f678d2c4a1 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -193,18 +193,6 @@  char *get_commit_graph_chain_filename(struct object_directory *odb)
 	return xstrfmt("%s/info/commit-graphs/commit-graph-chain", odb->path);
 }
 
-static uint8_t oid_version(void)
-{
-	switch (hash_algo_by_ptr(the_hash_algo)) {
-	case GIT_HASH_SHA1:
-		return 1;
-	case GIT_HASH_SHA256:
-		return 2;
-	default:
-		die(_("invalid hash version"));
-	}
-}
-
 static struct commit_graph *alloc_commit_graph(void)
 {
 	struct commit_graph *g = xcalloc(1, sizeof(*g));
@@ -365,9 +353,9 @@  struct commit_graph *parse_commit_graph(struct repository *r,
 	}
 
 	hash_version = *(unsigned char*)(data + 5);
-	if (hash_version != oid_version()) {
+	if (hash_version != oid_version(the_hash_algo)) {
 		error(_("commit-graph hash version %X does not match version %X"),
-		      hash_version, oid_version());
+		      hash_version, oid_version(the_hash_algo));
 		return NULL;
 	}
 
@@ -1911,7 +1899,7 @@  static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	hashwrite_be32(f, GRAPH_SIGNATURE);
 
 	hashwrite_u8(f, GRAPH_VERSION);
-	hashwrite_u8(f, oid_version());
+	hashwrite_u8(f, oid_version(the_hash_algo));
 	hashwrite_u8(f, get_num_chunks(cf));
 	hashwrite_u8(f, ctx->num_commit_graphs_after - 1);
 
diff --git a/midx.c b/midx.c
index 865170bad0..65e670c5e2 100644
--- a/midx.c
+++ b/midx.c
@@ -41,18 +41,6 @@ 
 
 #define PACK_EXPIRED UINT_MAX
 
-static uint8_t oid_version(void)
-{
-	switch (hash_algo_by_ptr(the_hash_algo)) {
-	case GIT_HASH_SHA1:
-		return 1;
-	case GIT_HASH_SHA256:
-		return 2;
-	default:
-		die(_("invalid hash version"));
-	}
-}
-
 const unsigned char *get_midx_checksum(struct multi_pack_index *m)
 {
 	return m->data + m->data_len - the_hash_algo->rawsz;
@@ -134,9 +122,9 @@  struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local
 		      m->version);
 
 	hash_version = m->data[MIDX_BYTE_HASH_VERSION];
-	if (hash_version != oid_version()) {
+	if (hash_version != oid_version(the_hash_algo)) {
 		error(_("multi-pack-index hash version %u does not match version %u"),
-		      hash_version, oid_version());
+		      hash_version, oid_version(the_hash_algo));
 		goto cleanup_fail;
 	}
 	m->hash_len = the_hash_algo->rawsz;
@@ -420,7 +408,7 @@  static size_t write_midx_header(struct hashfile *f,
 {
 	hashwrite_be32(f, MIDX_SIGNATURE);
 	hashwrite_u8(f, MIDX_VERSION);
-	hashwrite_u8(f, oid_version());
+	hashwrite_u8(f, oid_version(the_hash_algo));
 	hashwrite_u8(f, num_chunks);
 	hashwrite_u8(f, 0); /* unused */
 	hashwrite_be32(f, num_packs);
diff --git a/pack-write.c b/pack-write.c
index d594e3008e..ff305b404c 100644
--- a/pack-write.c
+++ b/pack-write.c
@@ -2,6 +2,7 @@ 
 #include "pack.h"
 #include "csum-file.h"
 #include "remote.h"
+#include "chunk-format.h"
 
 void reset_pack_idx_option(struct pack_idx_option *opts)
 {
@@ -181,21 +182,9 @@  static int pack_order_cmp(const void *va, const void *vb, void *ctx)
 
 static void write_rev_header(struct hashfile *f)
 {
-	uint32_t oid_version;
-	switch (hash_algo_by_ptr(the_hash_algo)) {
-	case GIT_HASH_SHA1:
-		oid_version = 1;
-		break;
-	case GIT_HASH_SHA256:
-		oid_version = 2;
-		break;
-	default:
-		die("write_rev_header: unknown hash version");
-	}
-
 	hashwrite_be32(f, RIDX_SIGNATURE);
 	hashwrite_be32(f, RIDX_VERSION);
-	hashwrite_be32(f, oid_version);
+	hashwrite_be32(f, oid_version(the_hash_algo));
 }
 
 static void write_rev_index_positions(struct hashfile *f,