diff mbox series

[03/10] fast-export: store anonymized oids as hex strings

Message ID 20200623152451.GC1435482@coredump.intra.peff.net (mailing list archive)
State New, archived
Headers show
Series fast-export: allow seeding the anonymized mapping | expand

Commit Message

Jeff King June 23, 2020, 3:24 p.m. UTC
When fast-export stores anonymized oids, it does so as binary strings.
And while the anonymous mapping storage is binary-clean (at least as of
the previous commit), this will become awkward when we start exposing
more of it to the user. In particular, if we allow a method for
retaining token "foo", then users may want to specify a hex oid as such
a token.

Let's just switch to storing the hex strings. The difference in memory
usage is negligible (especially considering how infrequently we'd
generally store an oid compared to, say, path components).

Signed-off-by: Jeff King <peff@peff.net>
---
 builtin/fast-export.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

Comments

SZEDER Gábor June 24, 2020, 11:43 a.m. UTC | #1
On Tue, Jun 23, 2020 at 11:24:51AM -0400, Jeff King wrote:

> diff --git a/builtin/fast-export.c b/builtin/fast-export.c
> index 289395a131..4a3a4c933e 100644
> --- a/builtin/fast-export.c
> +++ b/builtin/fast-export.c
> @@ -387,16 +387,19 @@ static void *generate_fake_oid(const void *old, size_t *len)
>  {
>  	static uint32_t counter = 1; /* avoid null oid */
>  	const unsigned hashsz = the_hash_algo->rawsz;
> -	unsigned char *out = xcalloc(hashsz, 1);
> -	put_be32(out + hashsz - 4, counter++);
> -	return out;
> +	struct object_id oid;
> +	char *hex = xmallocz(GIT_MAX_HEXSZ);
> +
> +	oidclr(&oid);
> +	put_be32(oid.hash + hashsz - 4, counter++);
> +	return oid_to_hex_r(hex, &oid);
>  }

GCC 4.8 complains about this change in our CI job:

  $ make CC=gcc-4.8 builtin/fast-export.o
      CC builtin/fast-export.o
  builtin/fast-export.c: In function ‘generate_fake_oid’:
  builtin/fast-export.c:394:2: error: dereferencing type-punned pointer will break strict-aliasing rules [-Werror=strict-aliasing]
    put_be32(oid.hash + hashsz - 4, counter++);
    ^
  cc1: all warnings being treated as errors
Jeff King June 24, 2020, 3:54 p.m. UTC | #2
On Wed, Jun 24, 2020 at 01:43:13PM +0200, SZEDER Gábor wrote:

> >  	static uint32_t counter = 1; /* avoid null oid */
> >  	const unsigned hashsz = the_hash_algo->rawsz;
> > -	unsigned char *out = xcalloc(hashsz, 1);
> > -	put_be32(out + hashsz - 4, counter++);
> > -	return out;
> > +	struct object_id oid;
> > +	char *hex = xmallocz(GIT_MAX_HEXSZ);
> > +
> > +	oidclr(&oid);
> > +	put_be32(oid.hash + hashsz - 4, counter++);
> > +	return oid_to_hex_r(hex, &oid);
> >  }
> 
> GCC 4.8 complains about this change in our CI job:
> 
>   $ make CC=gcc-4.8 builtin/fast-export.o
>       CC builtin/fast-export.o
>   builtin/fast-export.c: In function ‘generate_fake_oid’:
>   builtin/fast-export.c:394:2: error: dereferencing type-punned pointer will break strict-aliasing rules [-Werror=strict-aliasing]
>     put_be32(oid.hash + hashsz - 4, counter++);
>     ^
>   cc1: all warnings being treated as errors

Interesting. The only change on this line is swapping out the local
"unsigned char *" for an object_id, which also stores an "unsigned
char" (though as an array). And while put_be32() takes a void pointer,
it's inlined and treats it the argument an "unsigned char *" internally.
So I'm not sure I see that any type punning is going on at all.

I'll see if I can appease the compiler, though.

-Peff
Jeff King June 25, 2020, 3:49 p.m. UTC | #3
On Wed, Jun 24, 2020 at 11:54:20AM -0400, Jeff King wrote:

> > GCC 4.8 complains about this change in our CI job:
> > 
> >   $ make CC=gcc-4.8 builtin/fast-export.o
> >       CC builtin/fast-export.o
> >   builtin/fast-export.c: In function ‘generate_fake_oid’:
> >   builtin/fast-export.c:394:2: error: dereferencing type-punned pointer will break strict-aliasing rules [-Werror=strict-aliasing]
> >     put_be32(oid.hash + hashsz - 4, counter++);
> >     ^
> >   cc1: all warnings being treated as errors
> 
> Interesting. The only change on this line is swapping out the local
> "unsigned char *" for an object_id, which also stores an "unsigned
> char" (though as an array). And while put_be32() takes a void pointer,
> it's inlined and treats it the argument an "unsigned char *" internally.
> So I'm not sure I see that any type punning is going on at all.
> 
> I'll see if I can appease the compiler, though.

Hmm. Switching to a local array makes the warning go away:

diff --git a/builtin/fast-export.c b/builtin/fast-export.c
index 4a3a4c93..eae2ec5 100644
--- a/builtin/fast-export.c
+++ b/builtin/fast-export.c
@@ -387,12 +387,12 @@ static void *generate_fake_oid(const void *old, size_t *len)
 {
 	static uint32_t counter = 1; /* avoid null oid */
 	const unsigned hashsz = the_hash_algo->rawsz;
-	struct object_id oid;
+	unsigned char out[GIT_MAX_RAWSZ];
 	char *hex = xmallocz(GIT_MAX_HEXSZ);
 
-	oidclr(&oid);
-	put_be32(oid.hash + hashsz - 4, counter++);
-	return oid_to_hex_r(hex, &oid);
+	hashclr(out);
+	put_be32(out + hashsz - 4, counter++);
+	return hash_to_hex_algop_r(hex, out, the_hash_algo);
 }
 
 static const char *anonymize_oid(const char *oid_hex)

at the cost of some uglier use of the_hash_algo and RAWSZ. But
curiously, even with the array, if I adjust the write only slightly to
write at the begining instead of the end (we don't really care where our
counter appears in the hash, since the point is to anonymize):

diff --git a/builtin/fast-export.c b/builtin/fast-export.c
index eae2ec5..1f52247 100644
--- a/builtin/fast-export.c
+++ b/builtin/fast-export.c
@@ -386,12 +386,11 @@ static void print_path(const char *path)
 static void *generate_fake_oid(const void *old, size_t *len)
 {
 	static uint32_t counter = 1; /* avoid null oid */
-	const unsigned hashsz = the_hash_algo->rawsz;
 	unsigned char out[GIT_MAX_RAWSZ];
 	char *hex = xmallocz(GIT_MAX_HEXSZ);
 
 	hashclr(out);
-	put_be32(out + hashsz - 4, counter++);
+	put_be32(out, counter++);
 	return hash_to_hex_algop_r(hex, out, the_hash_algo);
 }
 

...then the warning comes back. I tried that because I was wondering
whether the same thing might make the warning go away on the object_id
version, but it doesn't.

So this really seems like a pointless false positive from the compiler,
and it's a rather old compiler (the warning no longer triggers in gcc 6
and up). Is it worth caring about? Ubuntu Trusty is out of standard
support but not fully EOL'd until 2022. Debian jessie has gcc 4.9
which triggers the warning, but will hit EOL in 5 days. If it were an
actual breakage I'd be more concerned, but keeping such an old compiler
-Werror clean doesn't seem that important.

I'd note also that none of the Actions CI jobs run with this compiler
version. If we _do_ want to care about it, it might be worth covering it
there.

-Peff
SZEDER Gábor June 25, 2020, 8:45 p.m. UTC | #4
On Thu, Jun 25, 2020 at 11:49:48AM -0400, Jeff King wrote:
> On Wed, Jun 24, 2020 at 11:54:20AM -0400, Jeff King wrote:
> 
> > > GCC 4.8 complains about this change in our CI job:
> > > 
> > >   $ make CC=gcc-4.8 builtin/fast-export.o
> > >       CC builtin/fast-export.o
> > >   builtin/fast-export.c: In function ‘generate_fake_oid’:
> > >   builtin/fast-export.c:394:2: error: dereferencing type-punned pointer will break strict-aliasing rules [-Werror=strict-aliasing]
> > >     put_be32(oid.hash + hashsz - 4, counter++);
> > >     ^
> > >   cc1: all warnings being treated as errors


> So this really seems like a pointless false positive from the compiler,
> and it's a rather old compiler (the warning no longer triggers in gcc 6
> and up). Is it worth caring about? Ubuntu Trusty is out of standard
> support but not fully EOL'd until 2022. Debian jessie has gcc 4.9
> which triggers the warning, but will hit EOL in 5 days. If it were an
> actual breakage I'd be more concerned, but keeping such an old compiler
> -Werror clean doesn't seem that important.
> 
> I'd note also that none of the Actions CI jobs run with this compiler
> version. If we _do_ want to care about it, it might be worth covering it
> there.

C99 style 'for' loop initial declarations are still frowned upon in
Git's codebase, and as far as we know it GCC 4.8 is the the most
recent compiler that can reasonably detect it; see fb9d7431cf
(travis-ci: build with GCC 4.8 as well, 2019-07-18).
Jeff King June 25, 2020, 9:15 p.m. UTC | #5
On Thu, Jun 25, 2020 at 10:45:32PM +0200, SZEDER Gábor wrote:

> > So this really seems like a pointless false positive from the compiler,
> > and it's a rather old compiler (the warning no longer triggers in gcc 6
> > and up). Is it worth caring about? Ubuntu Trusty is out of standard
> > support but not fully EOL'd until 2022. Debian jessie has gcc 4.9
> > which triggers the warning, but will hit EOL in 5 days. If it were an
> > actual breakage I'd be more concerned, but keeping such an old compiler
> > -Werror clean doesn't seem that important.
> > 
> > I'd note also that none of the Actions CI jobs run with this compiler
> > version. If we _do_ want to care about it, it might be worth covering it
> > there.
> 
> C99 style 'for' loop initial declarations are still frowned upon in
> Git's codebase, and as far as we know it GCC 4.8 is the the most
> recent compiler that can reasonably detect it; see fb9d7431cf
> (travis-ci: build with GCC 4.8 as well, 2019-07-18).

TBH, that does not seem like that compelling a reason to me to keep it
around. If no compiler is complaining of C99 for-loop declarations, then
maybe we should consider loosening our style. Or are we trying to be
kind of some unknown set of platform-specific compilers that we can't
feasibly hit in our CI?

I also note that fb9d7431cf mentions that -std=c90 doesn't work, because
there are other spots where we violate the standard (looks like "inline"
is a big one). But gcc with -std=gnu89 seems to compile just fine for
me, and does notice for-loop declarations. That's obviously totally
unportable, but it would be sufficient for a CI test.

-Peff
Johannes Schindelin June 29, 2020, 1:17 p.m. UTC | #6
Hi Peff & Gábor,

On Thu, 25 Jun 2020, Jeff King wrote:

> On Thu, Jun 25, 2020 at 10:45:32PM +0200, SZEDER Gábor wrote:
>
> > > So this really seems like a pointless false positive from the compiler,
> > > and it's a rather old compiler (the warning no longer triggers in gcc 6
> > > and up). Is it worth caring about? Ubuntu Trusty is out of standard
> > > support but not fully EOL'd until 2022. Debian jessie has gcc 4.9
> > > which triggers the warning, but will hit EOL in 5 days. If it were an
> > > actual breakage I'd be more concerned, but keeping such an old compiler
> > > -Werror clean doesn't seem that important.
> > >
> > > I'd note also that none of the Actions CI jobs run with this compiler
> > > version. If we _do_ want to care about it, it might be worth covering it
> > > there.
> >
> > C99 style 'for' loop initial declarations are still frowned upon in
> > Git's codebase, and as far as we know it GCC 4.8 is the the most
> > recent compiler that can reasonably detect it; see fb9d7431cf
> > (travis-ci: build with GCC 4.8 as well, 2019-07-18).
>
> TBH, that does not seem like that compelling a reason to me to keep it
> around. If no compiler is complaining of C99 for-loop declarations, then
> maybe we should consider loosening our style. Or are we trying to be
> kind of some unknown set of platform-specific compilers that we can't
> feasibly hit in our CI?

FWIW _iff_ we decide to loosen our style, I would like to propose doing it
in one place first, and keep it that way for two or three major versions.

Traditionally, people stuck on platforms such as IRIX or HP/UX with
proprietary C compilers (and remember: I once was one of those people)
often lack the luxury of upgrading frequently. And if it turns out that we
want to revert the style-loosening, it is much easier to do it in one
place than in many.

Ciao,
Dscho
Jeff King June 30, 2020, 7:35 p.m. UTC | #7
On Mon, Jun 29, 2020 at 03:17:00PM +0200, Johannes Schindelin wrote:

> > TBH, that does not seem like that compelling a reason to me to keep it
> > around. If no compiler is complaining of C99 for-loop declarations, then
> > maybe we should consider loosening our style. Or are we trying to be
> > kind of some unknown set of platform-specific compilers that we can't
> > feasibly hit in our CI?
> 
> FWIW _iff_ we decide to loosen our style, I would like to propose doing it
> in one place first, and keep it that way for two or three major versions.
> 
> Traditionally, people stuck on platforms such as IRIX or HP/UX with
> proprietary C compilers (and remember: I once was one of those people)
> often lack the luxury of upgrading frequently. And if it turns out that we
> want to revert the style-loosening, it is much easier to do it in one
> place than in many.

Yeah, I definitely agree that would be the way to do it. I admit I don't
even _really_ care that much about allowing the feature. More that it
might not be worth trying to crack down on it so aggressively, and
just letting it get picked up by review (or if it slips through, letting
the poor souls stuck on HP/UX complain).

(And I say "worth" because now the use of gcc 4.8 as the checking tool
has demonstrably cost a bunch of human time, so it comes with a cost).

-Peff
diff mbox series

Patch

diff --git a/builtin/fast-export.c b/builtin/fast-export.c
index 289395a131..4a3a4c933e 100644
--- a/builtin/fast-export.c
+++ b/builtin/fast-export.c
@@ -387,16 +387,19 @@  static void *generate_fake_oid(const void *old, size_t *len)
 {
 	static uint32_t counter = 1; /* avoid null oid */
 	const unsigned hashsz = the_hash_algo->rawsz;
-	unsigned char *out = xcalloc(hashsz, 1);
-	put_be32(out + hashsz - 4, counter++);
-	return out;
+	struct object_id oid;
+	char *hex = xmallocz(GIT_MAX_HEXSZ);
+
+	oidclr(&oid);
+	put_be32(oid.hash + hashsz - 4, counter++);
+	return oid_to_hex_r(hex, &oid);
 }
 
-static const struct object_id *anonymize_oid(const struct object_id *oid)
+static const char *anonymize_oid(const char *oid_hex)
 {
 	static struct hashmap objs;
-	size_t len = the_hash_algo->rawsz;
-	return anonymize_mem(&objs, generate_fake_oid, oid, &len);
+	size_t len = strlen(oid_hex);
+	return anonymize_mem(&objs, generate_fake_oid, oid_hex, &len);
 }
 
 static void show_filemodify(struct diff_queue_struct *q,
@@ -455,9 +458,9 @@  static void show_filemodify(struct diff_queue_struct *q,
 			 */
 			if (no_data || S_ISGITLINK(spec->mode))
 				printf("M %06o %s ", spec->mode,
-				       oid_to_hex(anonymize ?
-						  anonymize_oid(&spec->oid) :
-						  &spec->oid));
+				       anonymize ?
+				       anonymize_oid(oid_to_hex(&spec->oid)) :
+				       oid_to_hex(&spec->oid));
 			else {
 				struct object *object = lookup_object(the_repository,
 								      &spec->oid);
@@ -712,9 +715,10 @@  static void handle_commit(struct commit *commit, struct rev_info *rev,
 		if (mark)
 			printf(":%d\n", mark);
 		else
-			printf("%s\n", oid_to_hex(anonymize ?
-						  anonymize_oid(&obj->oid) :
-						  &obj->oid));
+			printf("%s\n",
+			       anonymize ?
+			       anonymize_oid(oid_to_hex(&obj->oid)) :
+			       oid_to_hex(&obj->oid));
 		i++;
 	}