diff mbox series

[08/15] cache: compare the entire buffer for struct object_id

Message ID 20210410152140.3525040-9-sandals@crustytoothpaste.net (mailing list archive)
State Superseded
Headers show
Series SHA-256 / SHA-1 interop, part 1 | expand

Commit Message

brian m. carlson April 10, 2021, 3:21 p.m. UTC
Currently, when we compare two object IDs, we have to take a branch to
determine what the hash size is supposed to be.  The compiler can
optimize well for a single length, but has trouble when there are two
possible lengths.

There is, however, an alternative: we can ensure that we always compare
the full length of the hash buffer, but in turn we must zero the
remainder of the buffer when using SHA-1; otherwise, we'll end up with
incompatible junk at the end of otherwise equivalent object IDs that
will prevent them from matching.  This is an acceptable tradeoff,
because we generally read an object ID in once, but then compare it
against others multiple times.

This latter approach also has some benefits as well: since we will have
annotated every location in which we load an object ID into an instance
of struct object_id, if we want to set the hash algorithm for the object
ID, we can do so at the same time.

Adopt this latter approach, since it provides us greater flexibility and
lets us read and store object IDs for multiple algorithms at once.

Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
---
 hash.h        | 13 ++++++++++---
 hex.c         |  9 ++++++---
 notes.c       |  3 +++
 object-file.c |  1 +
 4 files changed, 20 insertions(+), 6 deletions(-)

Comments

Chris Torek April 11, 2021, 8:17 a.m. UTC | #1
Just an observation here: comparing 256 bytes every time
would seem to have one nice bonus side effect and one
potentially bad, but vanishingly unlikely, side effect: a 160
byte null hash will now compare equal to a 256 byte null
hash (good), but a 160 byte hash extended to 256 bytes
will compare equal to a 256 byte hash that just happens to
end in 96 bytes of zero (bad, but I would guess, will never
actually happen).

Chris

On Sat, Apr 10, 2021 at 8:23 AM brian m. carlson
<sandals@crustytoothpaste.net> wrote:
>
> Currently, when we compare two object IDs, we have to take a branch to
> determine what the hash size is supposed to be.  The compiler can
> optimize well for a single length, but has trouble when there are two
> possible lengths.
>
> There is, however, an alternative: we can ensure that we always compare
> the full length of the hash buffer, but in turn we must zero the
> remainder of the buffer when using SHA-1; otherwise, we'll end up with
> incompatible junk at the end of otherwise equivalent object IDs that
> will prevent them from matching.  This is an acceptable tradeoff,
> because we generally read an object ID in once, but then compare it
> against others multiple times.
>
> This latter approach also has some benefits as well: since we will have
> annotated every location in which we load an object ID into an instance
> of struct object_id, if we want to set the hash algorithm for the object
> ID, we can do so at the same time.
>
> Adopt this latter approach, since it provides us greater flexibility and
> lets us read and store object IDs for multiple algorithms at once.
>
> Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
> ---
>  hash.h        | 13 ++++++++++---
>  hex.c         |  9 ++++++---
>  notes.c       |  3 +++
>  object-file.c |  1 +
>  4 files changed, 20 insertions(+), 6 deletions(-)
>
> diff --git a/hash.h b/hash.h
> index c8f03d8aee..04eba5c56b 100644
> --- a/hash.h
> +++ b/hash.h
> @@ -205,7 +205,7 @@ static inline int hashcmp(const unsigned char *sha1, const unsigned char *sha2)
>
>  static inline int oidcmp(const struct object_id *oid1, const struct object_id *oid2)
>  {
> -       return hashcmp(oid1->hash, oid2->hash);
> +       return memcmp(oid1->hash, oid2->hash, GIT_MAX_RAWSZ);
>  }
>
>  static inline int hasheq(const unsigned char *sha1, const unsigned char *sha2)
> @@ -221,7 +221,7 @@ static inline int hasheq(const unsigned char *sha1, const unsigned char *sha2)
>
>  static inline int oideq(const struct object_id *oid1, const struct object_id *oid2)
>  {
> -       return hasheq(oid1->hash, oid2->hash);
> +       return !memcmp(oid1->hash, oid2->hash, GIT_MAX_RAWSZ);
>  }
>
>  static inline int is_null_oid(const struct object_id *oid)
> @@ -258,7 +258,9 @@ static inline void oidclr(struct object_id *oid)
>
>  static inline void oidread(struct object_id *oid, const unsigned char *hash)
>  {
> -       memcpy(oid->hash, hash, the_hash_algo->rawsz);
> +       size_t rawsz = the_hash_algo->rawsz;
> +       memcpy(oid->hash, hash, rawsz);
> +       memset(oid->hash + rawsz, 0, GIT_MAX_RAWSZ - rawsz);
>  }
>
>  static inline int is_empty_blob_sha1(const unsigned char *sha1)
> @@ -281,6 +283,11 @@ static inline int is_empty_tree_oid(const struct object_id *oid)
>         return oideq(oid, the_hash_algo->empty_tree);
>  }
>
> +static inline void oid_pad_buffer(struct object_id *oid, const struct git_hash_algo *algop)
> +{
> +       memset(oid->hash + algop->rawsz, 0, GIT_MAX_RAWSZ - algop->rawsz);
> +}
> +
>  const char *empty_tree_oid_hex(void);
>  const char *empty_blob_oid_hex(void);
>
> diff --git a/hex.c b/hex.c
> index da51e64929..5fa3e71cb9 100644
> --- a/hex.c
> +++ b/hex.c
> @@ -69,7 +69,10 @@ int get_sha1_hex(const char *hex, unsigned char *sha1)
>  int get_oid_hex_algop(const char *hex, struct object_id *oid,
>                       const struct git_hash_algo *algop)
>  {
> -       return get_hash_hex_algop(hex, oid->hash, algop);
> +       int ret = get_hash_hex_algop(hex, oid->hash, algop);
> +       if (!ret)
> +               oid_pad_buffer(oid, algop);
> +       return ret;
>  }
>
>  /*
> @@ -80,7 +83,7 @@ int get_oid_hex_any(const char *hex, struct object_id *oid)
>  {
>         int i;
>         for (i = GIT_HASH_NALGOS - 1; i > 0; i--) {
> -               if (!get_hash_hex_algop(hex, oid->hash, &hash_algos[i]))
> +               if (!get_oid_hex_algop(hex, oid, &hash_algos[i]))
>                         return i;
>         }
>         return GIT_HASH_UNKNOWN;
> @@ -95,7 +98,7 @@ int parse_oid_hex_algop(const char *hex, struct object_id *oid,
>                         const char **end,
>                         const struct git_hash_algo *algop)
>  {
> -       int ret = get_hash_hex_algop(hex, oid->hash, algop);
> +       int ret = get_oid_hex_algop(hex, oid, algop);
>         if (!ret)
>                 *end = hex + algop->hexsz;
>         return ret;
> diff --git a/notes.c b/notes.c
> index a44b25858f..1dfe9e2b9f 100644
> --- a/notes.c
> +++ b/notes.c
> @@ -455,6 +455,8 @@ static void load_subtree(struct notes_tree *t, struct leaf_node *subtree,
>                 CALLOC_ARRAY(l, 1);
>                 oidcpy(&l->key_oid, &object_oid);
>                 oidcpy(&l->val_oid, &entry.oid);
> +               oid_pad_buffer(&l->key_oid, the_hash_algo);
> +               oid_pad_buffer(&l->val_oid, the_hash_algo);
>                 if (note_tree_insert(t, node, n, l, type,
>                                      combine_notes_concatenate))
>                         die("Failed to load %s %s into notes tree "
> @@ -484,6 +486,7 @@ static void load_subtree(struct notes_tree *t, struct leaf_node *subtree,
>                                 strbuf_addch(&non_note_path, '/');
>                         }
>                         strbuf_addstr(&non_note_path, entry.path);
> +                       oid_pad_buffer(&entry.oid, the_hash_algo);
>                         add_non_note(t, strbuf_detach(&non_note_path, NULL),
>                                      entry.mode, entry.oid.hash);
>                 }
> diff --git a/object-file.c b/object-file.c
> index 3f43c376e7..8e338247cc 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -2352,6 +2352,7 @@ int for_each_file_in_obj_subdir(unsigned int subdir_nr,
>                 if (namelen == the_hash_algo->hexsz - 2 &&
>                     !hex_to_bytes(oid.hash + 1, de->d_name,
>                                   the_hash_algo->rawsz - 1)) {
> +                       oid_pad_buffer(&oid, the_hash_algo);
>                         if (obj_cb) {
>                                 r = obj_cb(&oid, path->buf, data);
>                                 if (r)
Ævar Arnfjörð Bjarmason April 11, 2021, 11:36 a.m. UTC | #2
On Sat, Apr 10 2021, brian m. carlson wrote:

> Currently, when we compare two object IDs, we have to take a branch to
> determine what the hash size is supposed to be.  The compiler can
> optimize well for a single length, but has trouble when there are two
> possible lengths.

This would benefit from some performance/perf numbers. When this code
was first changed like this in 183a638b7da (hashcmp: assert constant
hash size, 2018-08-23) we had:

      Test     v2.18.0             v2.19.0-rc0               HEAD
      ------------------------------------------------------------------------------
      0001.2:  34.24(33.81+0.43)   34.83(34.42+0.40) +1.7%   33.90(33.47+0.42) -1.0%

Then it was later modified in 0dab7129ab1 (cache: make hashcmp and
hasheq work with larger hashes, 2018-11-14).

> @@ -205,7 +205,7 @@ static inline int hashcmp(const unsigned char *sha1, const unsigned char *sha2)
>  
>  static inline int oidcmp(const struct object_id *oid1, const struct object_id *oid2)
>  {
> -	return hashcmp(oid1->hash, oid2->hash);
> +	return memcmp(oid1->hash, oid2->hash, GIT_MAX_RAWSZ);
>  }

hashcmp is now:

        if (the_hash_algo->rawsz == GIT_MAX_RAWSZ)
                return memcmp(sha1, sha2, GIT_MAX_RAWSZ);
        return memcmp(sha1, sha2, GIT_SHA1_RAWSZ);

Wouldn't it make more sense to amend it to just be a memcmp
wrapper/macro if we're going to not make this conditional on the hash
algorithm, or are there other callsites where we still want the old way
of doing it?

>  
>  static inline int hasheq(const unsigned char *sha1, const unsigned char *sha2)
> @@ -221,7 +221,7 @@ static inline int hasheq(const unsigned char *sha1, const unsigned char *sha2)
>  
>  static inline int oideq(const struct object_id *oid1, const struct object_id *oid2)
>  {
> -	return hasheq(oid1->hash, oid2->hash);
> +	return !memcmp(oid1->hash, oid2->hash, GIT_MAX_RAWSZ);
>  }

Ditto hasheq v.s. !memcmp:

        if (the_hash_algo->rawsz == GIT_MAX_RAWSZ)
                return !memcmp(sha1, sha2, GIT_MAX_RAWSZ);
        return !memcmp(sha1, sha2, GIT_SHA1_RAWSZ);
brian m. carlson April 11, 2021, 9:05 p.m. UTC | #3
On 2021-04-11 at 11:36:33, Ævar Arnfjörð Bjarmason wrote:
> 
> On Sat, Apr 10 2021, brian m. carlson wrote:
> 
> > Currently, when we compare two object IDs, we have to take a branch to
> > determine what the hash size is supposed to be.  The compiler can
> > optimize well for a single length, but has trouble when there are two
> > possible lengths.
> 
> This would benefit from some performance/perf numbers. When this code
> was first changed like this in 183a638b7da (hashcmp: assert constant
> hash size, 2018-08-23) we had:
> 
>       Test     v2.18.0             v2.19.0-rc0               HEAD
>       ------------------------------------------------------------------------------
>       0001.2:  34.24(33.81+0.43)   34.83(34.42+0.40) +1.7%   33.90(33.47+0.42) -1.0%
> 
> Then it was later modified in 0dab7129ab1 (cache: make hashcmp and
> hasheq work with larger hashes, 2018-11-14).

I can do some perf numbers.

> > @@ -205,7 +205,7 @@ static inline int hashcmp(const unsigned char *sha1, const unsigned char *sha2)
> >  
> >  static inline int oidcmp(const struct object_id *oid1, const struct object_id *oid2)
> >  {
> > -	return hashcmp(oid1->hash, oid2->hash);
> > +	return memcmp(oid1->hash, oid2->hash, GIT_MAX_RAWSZ);
> >  }
> 
> hashcmp is now:
> 
>         if (the_hash_algo->rawsz == GIT_MAX_RAWSZ)
>                 return memcmp(sha1, sha2, GIT_MAX_RAWSZ);
>         return memcmp(sha1, sha2, GIT_SHA1_RAWSZ);
> 
> Wouldn't it make more sense to amend it to just be a memcmp
> wrapper/macro if we're going to not make this conditional on the hash
> algorithm, or are there other callsites where we still want the old way
> of doing it?

No, we can't do that.  With oidcmp, we know the buffer is large enough.
However, in some cases, the buffer in hashcmp is not large enough.  For
example, we may be at the end of a SHA-1 tree object and we'd segfault.
I did try that and I quickly found that it was totally broken.
diff mbox series

Patch

diff --git a/hash.h b/hash.h
index c8f03d8aee..04eba5c56b 100644
--- a/hash.h
+++ b/hash.h
@@ -205,7 +205,7 @@  static inline int hashcmp(const unsigned char *sha1, const unsigned char *sha2)
 
 static inline int oidcmp(const struct object_id *oid1, const struct object_id *oid2)
 {
-	return hashcmp(oid1->hash, oid2->hash);
+	return memcmp(oid1->hash, oid2->hash, GIT_MAX_RAWSZ);
 }
 
 static inline int hasheq(const unsigned char *sha1, const unsigned char *sha2)
@@ -221,7 +221,7 @@  static inline int hasheq(const unsigned char *sha1, const unsigned char *sha2)
 
 static inline int oideq(const struct object_id *oid1, const struct object_id *oid2)
 {
-	return hasheq(oid1->hash, oid2->hash);
+	return !memcmp(oid1->hash, oid2->hash, GIT_MAX_RAWSZ);
 }
 
 static inline int is_null_oid(const struct object_id *oid)
@@ -258,7 +258,9 @@  static inline void oidclr(struct object_id *oid)
 
 static inline void oidread(struct object_id *oid, const unsigned char *hash)
 {
-	memcpy(oid->hash, hash, the_hash_algo->rawsz);
+	size_t rawsz = the_hash_algo->rawsz;
+	memcpy(oid->hash, hash, rawsz);
+	memset(oid->hash + rawsz, 0, GIT_MAX_RAWSZ - rawsz);
 }
 
 static inline int is_empty_blob_sha1(const unsigned char *sha1)
@@ -281,6 +283,11 @@  static inline int is_empty_tree_oid(const struct object_id *oid)
 	return oideq(oid, the_hash_algo->empty_tree);
 }
 
+static inline void oid_pad_buffer(struct object_id *oid, const struct git_hash_algo *algop)
+{
+	memset(oid->hash + algop->rawsz, 0, GIT_MAX_RAWSZ - algop->rawsz);
+}
+
 const char *empty_tree_oid_hex(void);
 const char *empty_blob_oid_hex(void);
 
diff --git a/hex.c b/hex.c
index da51e64929..5fa3e71cb9 100644
--- a/hex.c
+++ b/hex.c
@@ -69,7 +69,10 @@  int get_sha1_hex(const char *hex, unsigned char *sha1)
 int get_oid_hex_algop(const char *hex, struct object_id *oid,
 		      const struct git_hash_algo *algop)
 {
-	return get_hash_hex_algop(hex, oid->hash, algop);
+	int ret = get_hash_hex_algop(hex, oid->hash, algop);
+	if (!ret)
+		oid_pad_buffer(oid, algop);
+	return ret;
 }
 
 /*
@@ -80,7 +83,7 @@  int get_oid_hex_any(const char *hex, struct object_id *oid)
 {
 	int i;
 	for (i = GIT_HASH_NALGOS - 1; i > 0; i--) {
-		if (!get_hash_hex_algop(hex, oid->hash, &hash_algos[i]))
+		if (!get_oid_hex_algop(hex, oid, &hash_algos[i]))
 			return i;
 	}
 	return GIT_HASH_UNKNOWN;
@@ -95,7 +98,7 @@  int parse_oid_hex_algop(const char *hex, struct object_id *oid,
 			const char **end,
 			const struct git_hash_algo *algop)
 {
-	int ret = get_hash_hex_algop(hex, oid->hash, algop);
+	int ret = get_oid_hex_algop(hex, oid, algop);
 	if (!ret)
 		*end = hex + algop->hexsz;
 	return ret;
diff --git a/notes.c b/notes.c
index a44b25858f..1dfe9e2b9f 100644
--- a/notes.c
+++ b/notes.c
@@ -455,6 +455,8 @@  static void load_subtree(struct notes_tree *t, struct leaf_node *subtree,
 		CALLOC_ARRAY(l, 1);
 		oidcpy(&l->key_oid, &object_oid);
 		oidcpy(&l->val_oid, &entry.oid);
+		oid_pad_buffer(&l->key_oid, the_hash_algo);
+		oid_pad_buffer(&l->val_oid, the_hash_algo);
 		if (note_tree_insert(t, node, n, l, type,
 				     combine_notes_concatenate))
 			die("Failed to load %s %s into notes tree "
@@ -484,6 +486,7 @@  static void load_subtree(struct notes_tree *t, struct leaf_node *subtree,
 				strbuf_addch(&non_note_path, '/');
 			}
 			strbuf_addstr(&non_note_path, entry.path);
+			oid_pad_buffer(&entry.oid, the_hash_algo);
 			add_non_note(t, strbuf_detach(&non_note_path, NULL),
 				     entry.mode, entry.oid.hash);
 		}
diff --git a/object-file.c b/object-file.c
index 3f43c376e7..8e338247cc 100644
--- a/object-file.c
+++ b/object-file.c
@@ -2352,6 +2352,7 @@  int for_each_file_in_obj_subdir(unsigned int subdir_nr,
 		if (namelen == the_hash_algo->hexsz - 2 &&
 		    !hex_to_bytes(oid.hash + 1, de->d_name,
 				  the_hash_algo->rawsz - 1)) {
+			oid_pad_buffer(&oid, the_hash_algo);
 			if (obj_cb) {
 				r = obj_cb(&oid, path->buf, data);
 				if (r)