
[v2,02/13] msvc: avoid using minus operator on unsigned types

Message ID 8800320590e4d7218a80f80abca23a7f44b8747d.1569837329.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Series: ci: include a Visual Studio build & test in our Azure Pipeline

Commit Message

Johannes Schindelin via GitGitGadget Sept. 30, 2019, 9:55 a.m. UTC
From: Johannes Schindelin <johannes.schindelin@gmx.de>

MSVC complains about this with `-Wall`, which can be taken as a sign
that this is indeed a real bug. The symptom is:

	C4146: unary minus operator applied to unsigned type, result
	still unsigned

Let's avoid this warning in the minimal way, i.e. by writing `-1 -
<unsigned value>` instead of `-<unsigned value> - 1`.
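
To illustrate (a minimal sketch, not taken from the patch):

	unsigned int nr = 5;
	int pos;

	/*
	 * Triggers C4146: `-nr` negates an unsigned type, so the
	 * result is still unsigned (UINT_MAX - 4), and only the
	 * implementation-defined conversion back to `int` makes the
	 * final value come out right.
	 */
	pos = -nr - 1;

	/* Same value, computed in signed arithmetic; no warning. */
	pos = -1 - (int)nr;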

Note that the change in the `estimate_cache_size()` function is
needed because the `sizeof()` operator yields a `size_t`, i.e. an
unsigned value, and therefore negating it via the unary minus
operator triggers this very warning.

Even worse, that arithmetic does extra work, in vain. We want to
calculate the per-entry extra cache size as the difference between
the size of the `cache_entry` structure and the size of the
`ondisk_cache_entry` structure, padded to the appropriate alignment
boundary.

To that end, we start by assigning that difference to the `per_entry`
variable, and then abuse the `len` parameter of the
`align_padding_size()` macro by passing it the negated size of the
on-disk entry. Essentially, instead of passing the already-calculated
difference to that macro, we pass the operands of that difference,
even though the macro expects the operands of an addition:

	#define align_padding_size(size, len) \
		((size + (len) + 8) & ~7) - (size + len)

Currently, we pass A and -B to that macro instead of passing A - B and
0, where A - B is already stored in the `per_entry` variable, ready to
be used.
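
To see that the two call styles compute the same value, expand the
macro by hand (done here purely for illustration):

	/*
	 * With A = sizeof(struct cache_entry) and
	 * B = sizeof(struct ondisk_cache_entry):
	 *
	 *   align_padding_size(A, -B)    -> ((A + (-B) + 8) & ~7) - (A + -B)
	 *   align_padding_size(A - B, 0) -> ((A - B + (0) + 8) & ~7) - (A - B + 0)
	 *
	 * Both reduce to ((A - B + 8) & ~7) - (A - B); only the first
	 * form applies unary minus to an unsigned operand.
	 */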

This is neither necessary nor intuitive. Let's fix this, and have
code that is easier to read and that does not trigger MSVC's warning.

While at it, we take care of reporting overflows (which are unlikely,
but hey, defensive programming is good!).

We _also_ take pains to cast the unsigned value to signed: otherwise,
the signed operand (i.e. the `-1`) would be converted to unsigned
before the arithmetic is performed.
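
To spell out the conversion rule (a small demonstration, not part of
the patch):

	size_t nr = 5;

	/*
	 * In `-1 - nr`, the usual arithmetic conversions turn the
	 * signed -1 into a huge size_t before the subtraction, so the
	 * expression evaluates to SIZE_MAX - 5, not -6.
	 *
	 * With the cast, the subtraction happens in signed arithmetic
	 * and yields -6, as intended (given nr <= INT_MAX).
	 */
	int pos = -1 - (int)nr;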

Helped-by: Denton Liu <liu.denton@gmail.com>
Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
---
 read-cache.c  |  9 ++++++---
 sha1-lookup.c | 12 +++++++++---
 2 files changed, 15 insertions(+), 6 deletions(-)

Comments

Junio C Hamano Oct. 3, 2019, 10:44 p.m. UTC | #1
"Johannes Schindelin via GitGitGadget" <gitgitgadget@gmail.com>
writes:

> While at it, we take care of reporting overflows (which are unlikely,
> but hey, defensive programming is good!).
>
> We _also_ take pains to cast the unsigned value to signed: otherwise,
> the signed operand (i.e. the `-1`) would be converted to unsigned
> before the arithmetic is performed.

These three look good, but they are so similar to each other that it
makes me wonder if we want to allow them to simply write

	return insert_pos_as_negative_offset(nr);

with something like

	static int insert_pos_as_negative_offset(uintmax_t nr)
	{
		if (INT_MAX < nr)
			die("overflow: -1 - %"PRIuMAX, nr);
		return -1 - (int)nr;
	}

to avoid repetition.
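
For illustration, the three call sites would then reduce to something
like (a sketch, using the variables from the patch):

	pos = insert_pos_as_negative_offset(istate->cache_nr);
	...
	return insert_pos_as_negative_offset(nr);
	...
	return insert_pos_as_negative_offset(lo);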

> Helped-by: Denton Liu <liu.denton@gmail.com>
> Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
> ---
>  read-cache.c  |  9 ++++++---
>  sha1-lookup.c | 12 +++++++++---
>  2 files changed, 15 insertions(+), 6 deletions(-)
>
> diff --git a/read-cache.c b/read-cache.c
> index c701f7f8b8..97745c2a31 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -1275,8 +1275,11 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e
>  	 * we can avoid searching for it.
>  	 */
>  	if (istate->cache_nr > 0 &&
> -		strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0)
> -		pos = -istate->cache_nr - 1;
> +		strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0) {
> +		if (istate->cache_nr > INT_MAX)
> +			die("overflow: -1 - %u", istate->cache_nr);
> +		pos = -1 - (int)istate->cache_nr;
> +	}
>  	else
>  		pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));
>  
> @@ -1894,7 +1897,7 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
>  	/*
>  	 * Account for potential alignment differences.
>  	 */
> -	per_entry += align_padding_size(sizeof(struct cache_entry), -sizeof(struct ondisk_cache_entry));
> +	per_entry += align_padding_size(per_entry, 0);
>  	return ondisk_size + entries * per_entry;
>  }
>  
> diff --git a/sha1-lookup.c b/sha1-lookup.c
> index 796ab68da8..bb786b5420 100644
> --- a/sha1-lookup.c
> +++ b/sha1-lookup.c
> @@ -69,8 +69,12 @@ int sha1_pos(const unsigned char *sha1, void *table, size_t nr,
>  			miv = take2(sha1 + ofs);
>  			if (miv < lov)
>  				return -1;
> -			if (hiv < miv)
> -				return -1 - nr;
> +			if (hiv < miv) {
> +				if (nr > INT_MAX)
> +					die("overflow: -1 - %"PRIuMAX,
> +					    (uintmax_t)nr);
> +				return -1 - (int)nr;
> +			}
>  			if (lov != hiv) {
>  				/*
>  				 * At this point miv could be equal
> @@ -97,7 +101,9 @@ int sha1_pos(const unsigned char *sha1, void *table, size_t nr,
>  			lo = mi + 1;
>  		mi = lo + (hi - lo) / 2;
>  	} while (lo < hi);
> -	return -lo-1;
> +	if (lo > INT_MAX)
> +		die("overflow: -1 - %"PRIuMAX, (uintmax_t)lo);
> +	return -1 - (int)lo;
>  }
>  
>  int bsearch_hash(const unsigned char *sha1, const uint32_t *fanout_nbo,
Johannes Schindelin Oct. 4, 2019, 9:55 a.m. UTC | #2
Hi Junio,

On Fri, 4 Oct 2019, Junio C Hamano wrote:

> "Johannes Schindelin via GitGitGadget" <gitgitgadget@gmail.com>
> writes:
>
> > While at it, we take care of reporting overflows (which are unlikely,
> > but hey, defensive programming is good!).
> >
> > We _also_ take pains to cast the unsigned value to signed: otherwise,
> > the signed operand (i.e. the `-1`) would be converted to unsigned
> > before the arithmetic is performed.
>
> These three look good, but they are so similar to each other that it
> makes me wonder if we want to allow them to simply write
>
> 	return insert_pos_as_negative_offset(nr);
>
> with something like
>
> 	static int insert_pos_as_negative_offset(uintmax_t nr)
> 	{
> 		if (INT_MAX < nr)
> 			die("overflow: -1 - %"PRIuMAX, nr);
> 		return -1 - (int)nr;
> 	}
>
> to avoid repetition.

I tried not to do that because there are two different data types in
play: `unsigned int` and `size_t`. But I guess by making this an
`inline` function, compilers can optimize for the common case and avoid
casting _twice_.
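
Something along these lines, I suppose (just a sketch, i.e. your
helper marked `inline`):

	/*
	 * Sketch: both `unsigned int` and `size_t` arguments widen
	 * losslessly to `uintmax_t`, so one helper serves all three
	 * call sites.
	 */
	static inline int insert_pos_as_negative_offset(uintmax_t nr)
	{
		if (INT_MAX < nr)
			die("overflow: -1 - %"PRIuMAX, nr);
		return -1 - (int)nr;
	}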

Will be fixed in v2,
Dscho

>
> > Helped-by: Denton Liu <liu.denton@gmail.com>
> > Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
> > ---
> >  read-cache.c  |  9 ++++++---
> >  sha1-lookup.c | 12 +++++++++---
> >  2 files changed, 15 insertions(+), 6 deletions(-)
> >
> > diff --git a/read-cache.c b/read-cache.c
> > index c701f7f8b8..97745c2a31 100644
> > --- a/read-cache.c
> > +++ b/read-cache.c
> > @@ -1275,8 +1275,11 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e
> >  	 * we can avoid searching for it.
> >  	 */
> >  	if (istate->cache_nr > 0 &&
> > -		strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0)
> > -		pos = -istate->cache_nr - 1;
> > +		strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0) {
> > +		if (istate->cache_nr > INT_MAX)
> > +			die("overflow: -1 - %u", istate->cache_nr);
> > +		pos = -1 - (int)istate->cache_nr;
> > +	}
> >  	else
> >  		pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));
> >
> > @@ -1894,7 +1897,7 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
> >  	/*
> >  	 * Account for potential alignment differences.
> >  	 */
> > -	per_entry += align_padding_size(sizeof(struct cache_entry), -sizeof(struct ondisk_cache_entry));
> > +	per_entry += align_padding_size(per_entry, 0);
> >  	return ondisk_size + entries * per_entry;
> >  }
> >
> > diff --git a/sha1-lookup.c b/sha1-lookup.c
> > index 796ab68da8..bb786b5420 100644
> > --- a/sha1-lookup.c
> > +++ b/sha1-lookup.c
> > @@ -69,8 +69,12 @@ int sha1_pos(const unsigned char *sha1, void *table, size_t nr,
> >  			miv = take2(sha1 + ofs);
> >  			if (miv < lov)
> >  				return -1;
> > -			if (hiv < miv)
> > -				return -1 - nr;
> > +			if (hiv < miv) {
> > +				if (nr > INT_MAX)
> > +					die("overflow: -1 - %"PRIuMAX,
> > +					    (uintmax_t)nr);
> > +				return -1 - (int)nr;
> > +			}
> >  			if (lov != hiv) {
> >  				/*
> >  				 * At this point miv could be equal
> > @@ -97,7 +101,9 @@ int sha1_pos(const unsigned char *sha1, void *table, size_t nr,
> >  			lo = mi + 1;
> >  		mi = lo + (hi - lo) / 2;
> >  	} while (lo < hi);
> > -	return -lo-1;
> > +	if (lo > INT_MAX)
> > +		die("overflow: -1 - %"PRIuMAX, (uintmax_t)lo);
> > +	return -1 - (int)lo;
> >  }
> >
> >  int bsearch_hash(const unsigned char *sha1, const uint32_t *fanout_nbo,
>
Johannes Sixt Oct. 4, 2019, 5:09 p.m. UTC | #3
Am 04.10.19 um 11:55 schrieb Johannes Schindelin:
> On Fri, 4 Oct 2019, Junio C Hamano wrote:
>> These three look good, but they are so similar to each other that it
>> makes me wonder if we want to allow them to simply write
>>
>> 	return insert_pos_as_negative_offset(nr);
>>
>> with something like
>>
>> 	static int insert_pos_as_negative_offset(uintmax_t nr)
>> 	{
>> 		if (INT_MAX < nr)
>> 			die("overflow: -1 - %"PRIuMAX, nr);
>> 		return -1 - (int)nr;
>> 	}
>>
>> to avoid repetition.
> 
> I tried not to do that because there are two different data types in
> play: `unsigned int` and `size_t`. But I guess by making this an
> `inline` function, compilers can optimize for the common case and avoid
> casting _twice_.
> 
> Will be fixed in v2,

IMHO, if you don't accompany insert_pos_as_negative_offset() with a
corresponding extract_pos_and_found_condition() and use it everywhere,
it is more obfuscating than necessary.

The *real* problem to solve is to ensure that the index/cache does not
grow so large that 32-bit indexes would be needed. Then the calculation
that you want to hide here cannot overflow.

-- Hannes
Johannes Schindelin Oct. 4, 2019, 9:24 p.m. UTC | #4
Hi Hannes,

On Fri, 4 Oct 2019, Johannes Sixt wrote:

> Am 04.10.19 um 11:55 schrieb Johannes Schindelin:
> > On Fri, 4 Oct 2019, Junio C Hamano wrote:
> >> These three look good, but they are so similar to each other that it
> >> makes me wonder if we want to allow them to simply write
> >>
> >> 	return insert_pos_as_negative_offset(nr);
> >>
> >> with something like
> >>
> >> 	static int insert_pos_as_negative_offset(uintmax_t nr)
> >> 	{
> >> 		if (INT_MAX < nr)
> >> 			die("overflow: -1 - %"PRIuMAX, nr);
> >> 		return -1 - (int)nr;
> >> 	}
> >>
> >> to avoid repetition.
> >
> > I tried not to do that because there are two different data types in
> > play: `unsigned int` and `size_t`. But I guess by making this an
> > `inline` function, compilers can optimize for the common case and avoid
> > casting _twice_.
> >
> > Will be fixed in v2,
>
> IMHO, if you don't accompany insert_pos_as_negative_offset() with a
> corresponding extract_pos_and_found_condition() and use it everywhere,
> it is more obfuscating than necessary.

I do disagree here. No overflow checking needs to be performed for `-1 -
<int-value>`. And that's what the opposite of this function really boils
down to.

> The *real* problem to solve is to ensure that the index/cache does not
> grow so large that 32-bit indexes would be needed. Then the calculation
> that you want to hide here cannot overflow.

Well, that may be the real problem for another patch series to solve.
The purpose of this patch series is to add a job to our Azure Pipeline
that builds Git with Visual Studio, and it patches the code minimally
so that it builds even in `DEVELOPER=1` mode, for good measure.

So I'd like not to dilute the purpose of this patch series.

Thanks,
Dscho
Junio C Hamano Oct. 6, 2019, 12:02 a.m. UTC | #5
Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

>> IMHO, if you don't accompany insert_pos_as_negative_offset() with a
>> corresponding extract_pos_and_found_condition() and use it everywhere,
>> it is more obfuscating than necessary.
>
> I do disagree here. No overflow checking needs to be performed for `-1 -
> <int-value>`. And that's what the opposite of this function really boils
> down to.

I do not think j6t is referring to the over/underflow issues at all.

The suggestion is that, because insert-pos-as-negative-offset
abstracts away (in addition to the overflow checks) the fact that
"does not exist but here is the location it would be inserted" is
encoded in a certain way (i.e. not just the array index negated, but
also is offset by -1, because we wouldn't be able to say "at the
very beginning at index 0" without the -1 offset), the side that
consumes the encoded "pos" (i.e. "we got a negative, so we know the
element does not exist, and the index into the array we would insert
a new element is computed this way") should be abstracted away, as
it must know that the extra negative offset used when encoding is
"-1".

I think that is a reasonable thing to consider; it is not necessary
for correctness, but contributes to the conceptual clarity (iow, it
can be left as a separate clean-up step done after the series is
done).
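
For concreteness, the decoding side would be something like this (a
hypothetical sketch; the names are invented for illustration):

	/* Did the lookup find the entry? */
	static inline int pos_found(int pos)
	{
		return pos >= 0;
	}

	/* For a negative pos, recover the index at which to insert. */
	static inline int pos_to_insert_index(int pos)
	{
		return -1 - pos; /* undoes the `-1 - nr` encoding */
	}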
Johannes Sixt Oct. 6, 2019, 10:53 a.m. UTC | #6
Am 06.10.19 um 02:02 schrieb Junio C Hamano:
> Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
> 
>>> IMHO, if you don't accompany insert_pos_as_negative_offset() with a
>>> corresponding extract_pos_and_found_condition() and use it everywhere,
>>> it is more obfuscating than necessary.
>>
>> I do disagree here. No overflow checking needs to be performed for `-1 -
>> <int-value>`. And that's what the opposite of this function really boils
>> down to.
> 
> I do not think j6t is referring to the over/underflow issues at all.
> 
> The suggestion is that, because insert-pos-as-negative-offset
> abstracts away [...] the fact that
> "does not exist but here is the location it would be inserted" is
> encoded in a certain way [...], the side that
> consumes the encoded "pos" [...] should be abstracted away [...].

Thank you, that summarizes my thoughts very well.

> I think that is a reasonable thing to consider; it is not necessary
> for correctness, but contributes to the conceptual clarity (iow, it
> can be left as a separate clean-up step done after the series is
> done).

Thanks, but I disagree with the course of action: I think we should do
both sides at the same time. (And if we aren't ready to do the decoding
side now, then we should delay the encoding side).

Consider someone looking at a call site without knowing the detailed
meaning of the return value ("What the heck is this -1 business?"):
it is a simple matter to look at the function being called and find
out what it means. With another function that does the encoding, it
is one additional hop. That is my reason for saying "more obfuscating
than necessary".

That the helper function would reduce some small code duplication does
not outweigh the obfuscation in my book. That's just MHO, of course.

-- Hannes
Johannes Schindelin Oct. 8, 2019, 12:04 p.m. UTC | #7
Hi Hannes,

On Sun, 6 Oct 2019, Johannes Sixt wrote:

> Am 06.10.19 um 02:02 schrieb Junio C Hamano:
> > Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
> >
> >>> IMHO, if you don't accompany insert_pos_as_negative_offset() with a
> >>> corresponding extract_pos_and_found_condition() and use it everywhere,
> >>> it is more obfuscating than necessary.
> >>
> >> I do disagree here. No overflow checking needs to be performed for `-1 -
> >> <int-value>`. And that's what the opposite of this function really boils
> >> down to.
> >
> > I do not think j6t is referring to the over/underflow issues at all.
> >
> > The suggestion is that, because insert-pos-as-negative-offset
> > abstracts away [...] the fact that
> > "does not exist but here is the location it would be inserted" is
> > encoded in a certain way [...], the side that
> > consumes the encoded "pos" [...] should be abstracted away [...].
>
> Thank you, that summarizes my thoughts very well.
>
> > I think that is a reasonable thing to consider; it is not necessary
> > for correctness, but contributes to the conceptual clarity (iow, it
> > can be left as a separate clean-up step done after the series is
> > done).
>
> Thanks, but I disagree with the course of action: I think we should do
> both sides at the same time. (And if we aren't ready to do the decoding
> side now, then we should delay the encoding side).
>
> Consider someone is looking at the call site without knowing the
> detailed meaning of the return value ("What the heck is this -1
> business?"), it is a matter to look at the function being called to know
> what it is. With another function that does the encoding, it is one
> additional hop. That is my reason for saying "more obfuscating than
> necessary".
>
> That the helper function would reduce some small code duplication does
> not outweigh the obfuscation in my book. That's just MHO, of course.

So you got what you wished for:
https://public-inbox.org/git/pull.378.git.gitgitgadget@gmail.com

Care to review?

Thanks,
Dscho
Johannes Sixt Oct. 8, 2019, 9:13 p.m. UTC | #8
Am 08.10.19 um 14:04 schrieb Johannes Schindelin:
> So you got what you wished for:
> https://public-inbox.org/git/pull.378.git.gitgitgadget@gmail.com

After having seen the result I do not wish for it anymore. (Not that I
had "wished" for it in the first place...) It does not make the result
any more readable than the original.

I do wish you had rejected Junio's suggestion to introduce
index_pos_to_insert_pos(). It doesn't make the code a lot more readable,
either, in my eyes.

-- Hannes

Patch

diff --git a/read-cache.c b/read-cache.c
index c701f7f8b8..97745c2a31 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1275,8 +1275,11 @@  static int add_index_entry_with_check(struct index_state *istate, struct cache_e
 	 * we can avoid searching for it.
 	 */
 	if (istate->cache_nr > 0 &&
-		strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0)
-		pos = -istate->cache_nr - 1;
+		strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0) {
+		if (istate->cache_nr > INT_MAX)
+			die("overflow: -1 - %u", istate->cache_nr);
+		pos = -1 - (int)istate->cache_nr;
+	}
 	else
 		pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));
 
@@ -1894,7 +1897,7 @@  static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	/*
 	 * Account for potential alignment differences.
 	 */
-	per_entry += align_padding_size(sizeof(struct cache_entry), -sizeof(struct ondisk_cache_entry));
+	per_entry += align_padding_size(per_entry, 0);
 	return ondisk_size + entries * per_entry;
 }
 
diff --git a/sha1-lookup.c b/sha1-lookup.c
index 796ab68da8..bb786b5420 100644
--- a/sha1-lookup.c
+++ b/sha1-lookup.c
@@ -69,8 +69,12 @@  int sha1_pos(const unsigned char *sha1, void *table, size_t nr,
 			miv = take2(sha1 + ofs);
 			if (miv < lov)
 				return -1;
-			if (hiv < miv)
-				return -1 - nr;
+			if (hiv < miv) {
+				if (nr > INT_MAX)
+					die("overflow: -1 - %"PRIuMAX,
+					    (uintmax_t)nr);
+				return -1 - (int)nr;
+			}
 			if (lov != hiv) {
 				/*
 				 * At this point miv could be equal
@@ -97,7 +101,9 @@  int sha1_pos(const unsigned char *sha1, void *table, size_t nr,
 			lo = mi + 1;
 		mi = lo + (hi - lo) / 2;
 	} while (lo < hi);
-	return -lo-1;
+	if (lo > INT_MAX)
+		die("overflow: -1 - %"PRIuMAX, (uintmax_t)lo);
+	return -1 - (int)lo;
 }
 
 int bsearch_hash(const unsigned char *sha1, const uint32_t *fanout_nbo,