Message ID | 8800320590e4d7218a80f80abca23a7f44b8747d.1569837329.git.gitgitgadget@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | ci: include a Visual Studio build & test in our Azure Pipeline | expand |
"Johannes Schindelin via GitGitGadget" <gitgitgadget@gmail.com> writes: > While at it, we take care of reporting overflows (which are unlikely, > but hey, defensive programming is good!). > > We _also_ take pains of casting the unsigned value to signed: otherwise, > the signed operand (i.e. the `-1`) would be cast to unsigned before > doing the arithmetic. These three look good and too similar to each other, which makes me wonder if we want to allow them simply write return insert_pos_as_negative_offset(nr); with something like static int insert_pos_as_negative_offset(uintmax_t nr) { if (INT_MAX < nr) die("overflow: -1 - %"PRIuMAX, nr); return -1 - (int)nr; } to avoid repetition. > Helped-by: Denton Liu <liu.denton@gmail.com> > Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> > --- > read-cache.c | 9 ++++++--- > sha1-lookup.c | 12 +++++++++--- > 2 files changed, 15 insertions(+), 6 deletions(-) > > diff --git a/read-cache.c b/read-cache.c > index c701f7f8b8..97745c2a31 100644 > --- a/read-cache.c > +++ b/read-cache.c > @@ -1275,8 +1275,11 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e > * we can avoid searching for it. > */ > if (istate->cache_nr > 0 && > - strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0) > - pos = -istate->cache_nr - 1; > + strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0) { > + if (istate->cache_nr > INT_MAX) > + die("overflow: -1 - %u", istate->cache_nr); > + pos = -1 - (int)istate->cache_nr; > + } > else > pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce)); > > @@ -1894,7 +1897,7 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries) > /* > * Account for potential alignment differences. 
> */ > - per_entry += align_padding_size(sizeof(struct cache_entry), -sizeof(struct ondisk_cache_entry)); > + per_entry += align_padding_size(per_entry, 0); > return ondisk_size + entries * per_entry; > } > > diff --git a/sha1-lookup.c b/sha1-lookup.c > index 796ab68da8..bb786b5420 100644 > --- a/sha1-lookup.c > +++ b/sha1-lookup.c > @@ -69,8 +69,12 @@ int sha1_pos(const unsigned char *sha1, void *table, size_t nr, > miv = take2(sha1 + ofs); > if (miv < lov) > return -1; > - if (hiv < miv) > - return -1 - nr; > + if (hiv < miv) { > + if (nr > INT_MAX) > + die("overflow: -1 - %"PRIuMAX, > + (uintmax_t)nr); > + return -1 - (int)nr; > + } > if (lov != hiv) { > /* > * At this point miv could be equal > @@ -97,7 +101,9 @@ int sha1_pos(const unsigned char *sha1, void *table, size_t nr, > lo = mi + 1; > mi = lo + (hi - lo) / 2; > } while (lo < hi); > - return -lo-1; > + if (nr > INT_MAX) > + die("overflow: -1 - %"PRIuMAX, (uintmax_t)lo); > + return -1 - (int)lo; > } > > int bsearch_hash(const unsigned char *sha1, const uint32_t *fanout_nbo,
Hi Junio, On Fri, 4 Oct 2019, Junio C Hamano wrote: > "Johannes Schindelin via GitGitGadget" <gitgitgadget@gmail.com> > writes: > > > While at it, we take care of reporting overflows (which are unlikely, > > but hey, defensive programming is good!). > > > > We _also_ take pains of casting the unsigned value to signed: otherwise, > > the signed operand (i.e. the `-1`) would be cast to unsigned before > > doing the arithmetic. > > These three look good and too similar to each other, which makes me > wonder if we want to allow them simply write > > return insert_pos_as_negative_offset(nr); > > with something like > > static int insert_pos_as_negative_offset(uintmax_t nr) > { > if (INT_MAX < nr) > die("overflow: -1 - %"PRIuMAX, nr); > return -1 - (int)nr; > } > > to avoid repetition. I tried not to do that because there are two different data types in play: `unsigned int` and `size_t`. But I guess by making this an `inline` function, compilers can optimize for the common case and avoid casting _twice_. Will be fixed in v2, Dscho > > > Helped-by: Denton Liu <liu.denton@gmail.com> > > Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> > > --- > > read-cache.c | 9 ++++++--- > > sha1-lookup.c | 12 +++++++++--- > > 2 files changed, 15 insertions(+), 6 deletions(-) > > > > diff --git a/read-cache.c b/read-cache.c > > index c701f7f8b8..97745c2a31 100644 > > --- a/read-cache.c > > +++ b/read-cache.c > > @@ -1275,8 +1275,11 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e > > * we can avoid searching for it. 
> > */ > > if (istate->cache_nr > 0 && > > - strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0) > > - pos = -istate->cache_nr - 1; > > + strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0) { > > + if (istate->cache_nr > INT_MAX) > > + die("overflow: -1 - %u", istate->cache_nr); > > + pos = -1 - (int)istate->cache_nr; > > + } > > else > > pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce)); > > > > @@ -1894,7 +1897,7 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries) > > /* > > * Account for potential alignment differences. > > */ > > - per_entry += align_padding_size(sizeof(struct cache_entry), -sizeof(struct ondisk_cache_entry)); > > + per_entry += align_padding_size(per_entry, 0); > > return ondisk_size + entries * per_entry; > > } > > > > diff --git a/sha1-lookup.c b/sha1-lookup.c > > index 796ab68da8..bb786b5420 100644 > > --- a/sha1-lookup.c > > +++ b/sha1-lookup.c > > @@ -69,8 +69,12 @@ int sha1_pos(const unsigned char *sha1, void *table, size_t nr, > > miv = take2(sha1 + ofs); > > if (miv < lov) > > return -1; > > - if (hiv < miv) > > - return -1 - nr; > > + if (hiv < miv) { > > + if (nr > INT_MAX) > > + die("overflow: -1 - %"PRIuMAX, > > + (uintmax_t)nr); > > + return -1 - (int)nr; > > + } > > if (lov != hiv) { > > /* > > * At this point miv could be equal > > @@ -97,7 +101,9 @@ int sha1_pos(const unsigned char *sha1, void *table, size_t nr, > > lo = mi + 1; > > mi = lo + (hi - lo) / 2; > > } while (lo < hi); > > - return -lo-1; > > + if (nr > INT_MAX) > > + die("overflow: -1 - %"PRIuMAX, (uintmax_t)lo); > > + return -1 - (int)lo; > > } > > > > int bsearch_hash(const unsigned char *sha1, const uint32_t *fanout_nbo, >
Am 04.10.19 um 11:55 schrieb Johannes Schindelin: > On Fri, 4 Oct 2019, Junio C Hamano wrote: >> These three look good and too similar to each other, which makes me >> wonder if we want to allow them simply write >> >> return insert_pos_as_negative_offset(nr); >> >> with something like >> >> static int insert_pos_as_negative_offset(uintmax_t nr) >> { >> if (INT_MAX < nr) >> die("overflow: -1 - %"PRIuMAX, nr); >> return -1 - (int)nr; >> } >> >> to avoid repetition. > > I tried not to do that because there are two different data types in > play: `unsigned int` and `size_t`. But I guess by making this an > `inline` function, compilers can optimize for the common case and avoid > casting _twice_. > > Will be fixed in v2, IMHO, if you don't accompany insert_pos_as_negative_offset() with a corresponding extract_pos_and_found_condition() and use it everywhere, it is more obfuscating than necessary. The *real* problem to solve is to ensure that the index/cache does not grow so large that 32-bit indexes would be needed. Then the calculation that you want to hide here cannot overflow. -- Hannes
Hi Hannes, On Fri, 4 Oct 2019, Johannes Sixt wrote: > Am 04.10.19 um 11:55 schrieb Johannes Schindelin: > > On Fri, 4 Oct 2019, Junio C Hamano wrote: > >> These three look good and too similar to each other, which makes me > >> wonder if we want to allow them simply write > >> > >> return insert_pos_as_negative_offset(nr); > >> > >> with something like > >> > >> static int insert_pos_as_negative_offset(uintmax_t nr) > >> { > >> if (INT_MAX < nr) > >> die("overflow: -1 - %"PRIuMAX, nr); > >> return -1 - (int)nr; > >> } > >> > >> to avoid repetition. > > > > I tried not to do that because there are two different data types in > > play: `unsigned int` and `size_t`. But I guess by making this an > > `inline` function, compilers can optimize for the common case and avoid > > casting _twice_. > > > > Will be fixed in v2, > > IMHO, if you don't accompany insert_pos_as_negative_offset() with a > corresponding extract_pos_and_found_condition() and use it everywhere, > it is more obfuscating than necessary. I do disagree here. No overflow checking needs to be performed for `-1 - <int-value>`. And that's what the opposite of this function really boils down to. > The *real* problem to solve is to ensure that the index/cache does not > grow so large that 32-bit indexes would be needed. Then the calculation > that you want to hide here cannot overflow. Well, that may be the real problem of another patch series. This patch series' problem is to add a job to our Azure Pipeline that builds Git with Visual Studio, and it patches the code minimally so that it builds even in `DEVELOPER=1` mode, for good measure. So I'd like not to dilute the purpose of this patch series. Thanks, Dscho
Johannes Schindelin <Johannes.Schindelin@gmx.de> writes: >> IMHO, if you don't accompany insert_pos_as_negative_offset() with a >> corresponding extract_pos_and_found_condition() and use it everywhere, >> it is more obfuscating than necessary. > > I do disagree here. No overflow checking needs to be performed for `-1 - > <int-value>`. And that's what the opposite of this function really boils > down to. I do not think j6t is referring to the over/underflow issues at all. The suggestion is that, because insert-pos-as-negative-offset abstracts away (in addition to the overflow checks) the fact that "does not exist but here is the location it would be inserted" is encoded in a certain way (i.e. not just the array index negated, but also is offset by -1, because we wouldn't be able to say "at the very beginning at index 0" without the -1 offset), the side that consumes the encoded "pos" (i.e. "we got a negative, so we know the element does not exist, and the index into the array at which we would insert a new element is computed this way") should be abstracted away, as it must know that the extra negative offset used when encoding is "-1". I think that is a reasonable thing to consider; it is not necessary for correctness, but contributes to the conceptual clarity (iow, it can be left as a separate clean-up step done after the series is done).
Am 06.10.19 um 02:02 schrieb Junio C Hamano: > Johannes Schindelin <Johannes.Schindelin@gmx.de> writes: > >>> IMHO, if you don't accompany insert_pos_as_negative_offset() with a >>> corresponding extract_pos_and_found_condition() and use it everywhere, >>> it is more obfuscating than necessary. >> >> I do disagree here. No overflow checking needs to be performed for `-1 - >> <int-value>`. And that's what the opposite of this function really boils >> down to. > > I do not think j6t is referring to the over/underflow issues at all. > > The suggestion is that, because insert-pos-as-negative-offset > abstracts away [...] the fact that > "does not exist but here is the location it would be inserted" is > encoded in a certain way [...], the side that > consumes the encoded "pos" [...] should be abstracted away [...]. Thank you, that summarizes my thoughts very well. > I think that is a reasonable thing to consider; it is not necessary > for correctness, but contributes to the conceptual clarity (iow, it > can be left as a separate clean-up step done after the series is > done). Thanks, but I disagree with the course of action: I think we should do both sides at the same time. (And if we aren't ready to do the decoding side now, then we should delay the encoding side). Consider someone looking at the call site without knowing the detailed meaning of the return value ("What the heck is this -1 business?"): it is a matter of looking at the function being called to know what it is. With another function that does the encoding, it is one additional hop. That is my reason for saying "more obfuscating than necessary". That the helper function would reduce some small code duplication does not outweigh the obfuscation in my book. That's just MHO, of course. -- Hannes
Hi Hannes, On Sun, 6 Oct 2019, Johannes Sixt wrote: > Am 06.10.19 um 02:02 schrieb Junio C Hamano: > > Johannes Schindelin <Johannes.Schindelin@gmx.de> writes: > > > >>> IMHO, if you don't accompany insert_pos_as_negative_offset() with a > >>> corresponding extract_pos_and_found_condition() and use it everywhere, > >>> it is more obfuscating than necessary. > >> > >> I do disagree here. No overflow checking needs to be performed for `-1 - > >> <int-value>`. And that's what the opposite of this function really boils > >> down to. > > > > I do not think j6t is referring to the over/underflow issues at all. > > > > The suggestion is that, because insert-pos-as-negative-offset > > abstracts away [...] the fact that > > "does not exist but here is the location it would be inserted" is > > encoded in a certain way [...], the side that > > consumes the encoded "pos" [...] should be abstracted away [...]. > > Thank you, that summarizes my thoughts very well. > > > I think that is a reasonable thing to consider; it is not necessary > > for correctness, but contributes to the conceptual clarity (iow, it > > can be left as a separate clean-up step done after the series is > > done). > > Thanks, but I disagree with the course of action: I think we should do > both sides at the same time. (And if we aren't ready to do the decoding > side now, then we should delay the encoding side). > > Consider someone is looking at the call site without knowing the > detailed meaning of the return value ("What the heck is this -1 > business?"), it is a matter to look at the function being called to know > what it is. With another function that does the encoding, it is one > additional hop. That is my reason for saying "more obfuscating than > necessary". > > That the helper function would reduce some small code duplication does > not outweigh the obfuscation in my book. That's just MHO, of course. 
So you got what you wished for: https://public-inbox.org/git/pull.378.git.gitgitgadget@gmail.com Care to review? Thanks, Dscho
Am 08.10.19 um 14:04 schrieb Johannes Schindelin: > So you got what you wished for: > https://public-inbox.org/git/pull.378.git.gitgitgadget@gmail.com After having seen the result I do not wish for it anymore. (Not that I had "wished" for it in the first place...) It does not make the result any more readable than the original. I do wish you had rejected Junio's suggestion to introduce index_pos_to_insert_pos(). It doesn't make the code a lot more readable, either, in my eyes. -- Hannes
diff --git a/read-cache.c b/read-cache.c index c701f7f8b8..97745c2a31 100644 --- a/read-cache.c +++ b/read-cache.c @@ -1275,8 +1275,11 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e * we can avoid searching for it. */ if (istate->cache_nr > 0 && - strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0) - pos = -istate->cache_nr - 1; + strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0) { + if (istate->cache_nr > INT_MAX) + die("overflow: -1 - %u", istate->cache_nr); + pos = -1 - (int)istate->cache_nr; + } else pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce)); @@ -1894,7 +1897,7 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries) /* * Account for potential alignment differences. */ - per_entry += align_padding_size(sizeof(struct cache_entry), -sizeof(struct ondisk_cache_entry)); + per_entry += align_padding_size(per_entry, 0); return ondisk_size + entries * per_entry; } diff --git a/sha1-lookup.c b/sha1-lookup.c index 796ab68da8..bb786b5420 100644 --- a/sha1-lookup.c +++ b/sha1-lookup.c @@ -69,8 +69,12 @@ int sha1_pos(const unsigned char *sha1, void *table, size_t nr, miv = take2(sha1 + ofs); if (miv < lov) return -1; - if (hiv < miv) - return -1 - nr; + if (hiv < miv) { + if (nr > INT_MAX) + die("overflow: -1 - %"PRIuMAX, + (uintmax_t)nr); + return -1 - (int)nr; + } if (lov != hiv) { /* * At this point miv could be equal @@ -97,7 +101,9 @@ int sha1_pos(const unsigned char *sha1, void *table, size_t nr, lo = mi + 1; mi = lo + (hi - lo) / 2; } while (lo < hi); - return -lo-1; + if (nr > INT_MAX) + die("overflow: -1 - %"PRIuMAX, (uintmax_t)lo); + return -1 - (int)lo; } int bsearch_hash(const unsigned char *sha1, const uint32_t *fanout_nbo,